From 84a5f0e2eb84bd02a09e00d30d888f162a49e84b Mon Sep 17 00:00:00 2001 From: Pablo Romero Date: Fri, 26 Aug 2022 11:44:11 +0200 Subject: [PATCH 001/154] Fixes #3743. --- common.h | 4 ++-- ctest/CMakeLists.txt | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/common.h b/common.h index 00d1d0baf..e6002d322 100644 --- a/common.h +++ b/common.h @@ -90,7 +90,7 @@ extern "C" { #endif #include -#ifdef OS_LINUX +#if defined(OS_LINUX) || defined(OS_QNX) #include #include #endif @@ -107,7 +107,7 @@ extern "C" { #endif #endif -#ifdef OS_HAIKU +#if defined(OS_HAIKU) || defined(OS_QNX) #define NO_SYSV_IPC #endif diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index e779fb168..91338b73b 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -40,7 +40,7 @@ else() c_${float_char}blas1.c) endif() target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) - if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") + if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat1 m) endif() add_test(NAME "x${float_char}cblat1" @@ -65,7 +65,7 @@ else() constant.c) endif() target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) - if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") + if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat2 m) endif() add_test(NAME "x${float_char}cblat2" @@ -90,7 +90,7 @@ else() constant.c) endif() target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) - if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") + if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") target_link_libraries(x${float_char}cblat3 m) endif() add_test(NAME "x${float_char}cblat3" From 1b1f781cf986376cb28020d6e5dab9c35b40919e Mon Sep 17 00:00:00 2001 From: Pablo Romero Date: Fri, 26 Aug 2022 11:45:23 +0200 Subject: [PATCH 002/154] Added name and details to contributors' list. --- CONTRIBUTORS.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 1714d90c8..f5e9dda91 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -211,4 +211,5 @@ In chronological order: * PLCT Lab, Institute of Software Chinese Academy of Sciences * [2022-03] Support RISC-V Vector Intrinisc 1.0 version. 
- \ No newline at end of file +* Pablo Romero + * [2022-08] Fix building from sources for QNX \ No newline at end of file From e15f810a023da3f93fbff9552182d07c94bb849f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 29 Aug 2022 17:31:57 +0200 Subject: [PATCH 003/154] Avoid spurious version queries and associated expr errors in the NOFORTRAN case --- Makefile.x86_64 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index f14a8a8ff..d5e9cbfc7 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -143,6 +143,7 @@ ifeq ($(C_COMPILER), CLANG) CCOMMON_OPT += -mavx2 endif endif +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) ifeq ($(F_COMPILER), GFORTRAN) # AVX2 support was added in 4.7.0 GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4) @@ -159,6 +160,7 @@ endif endif endif endif +endif endif From 68277282df4adaafaf9b4a01c2eeb629eed99528 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 30 Aug 2022 22:26:16 +0200 Subject: [PATCH 004/154] Work around XCode assembler SVE bug --- Makefile.arm64 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.arm64 b/Makefile.arm64 index 4efa55286..2ef0caa8b 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -124,7 +124,11 @@ ifeq ($(CORE), NEOVERSEN2) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) ifeq ($(GCCVERSIONGTEQ9), 1) ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) +ifneq ($(OSNAME), Darwin) CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 +else +CCOMMON_OPT += -march=armv8.5-a+sve -mtune=neoverse-n2 +endif ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 endif From ae3bcc8949cfaa8f37cfba864971227dc972fd96 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 31 Aug 2022 10:41:01 +0200 Subject: [PATCH 005/154] Drop NeoverseN2 to armv8.2-a on OSX to make it build with gcc11 too --- Makefile.arm64 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.arm64 b/Makefile.arm64 index 2ef0caa8b..480684422 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -127,7 +127,7 @@ ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) ifneq ($(OSNAME), Darwin) CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 else -CCOMMON_OPT += -march=armv8.5-a+sve -mtune=neoverse-n2 +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 endif ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 From 594ceeceda042e18265289d78cb31cbf82e41fa3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=BCtzel?= Date: Thu, 1 Sep 2022 14:35:12 +0200 Subject: [PATCH 006/154] CI (MSYS2): Configure with `-DCMAKE_BUILD_TYPE=Release`. 
--- .github/workflows/dynamic_arch.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index c34b0c462..418250675 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -235,7 +235,8 @@ jobs: - name: Configure OpenBLAS run: | mkdir build && cd build - cmake -DBUILD_SHARED_LIBS=ON \ + cmake -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_SHARED_LIBS=ON \ -DBUILD_STATIC_LIBS=ON \ -DDYNAMIC_ARCH=ON \ -DUSE_THREAD=ON \ @@ -258,6 +259,7 @@ jobs: timeout-minutes: 60 run: cd build && ctest + cross_build: runs-on: ubuntu-22.04 From c4d7ce338412bcfc4c6143d307d5b9547cb26db0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=BCtzel?= Date: Thu, 1 Sep 2022 18:08:39 +0200 Subject: [PATCH 007/154] CI (MSYS2): Add one runner with `-DCMAKE_BUILD_TYPE=None`. --- .github/workflows/dynamic_arch.yml | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index 418250675..669e41059 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -150,6 +150,7 @@ jobs: matrix: msystem: [MINGW64, MINGW32, CLANG64] idx: [int32, int64] + build-type: [Release] include: - msystem: MINGW64 idx: int32 @@ -173,6 +174,11 @@ jobs: idx64-flags: -DBINARY=64 -DINTERFACE64=1 target-prefix: mingw-w64-clang-x86_64 c-lapack-flags: -DC_LAPACK=ON + - msystem: MINGW64 + idx: int32 + target-prefix: mingw-w64-x86_64 + fc-pkg: mingw-w64-x86_64-gcc-fortran + build-type: None exclude: - msystem: MINGW32 idx: int64 @@ -215,11 +221,11 @@ jobs: path: C:/msys64/home/runneradmin/.ccache # We include the commit sha in the cache key, as new cache entries are # only created if there is no existing entry for the key yet. - key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ github.ref }}-${{ github.sha }} + key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }} # Restore a matching ccache cache entry. Prefer same branch. restore-keys: | - ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ github.ref }} - ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }} + ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }} + ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }} - name: Configure ccache # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota. 
@@ -235,7 +241,7 @@ jobs: - name: Configure OpenBLAS run: | mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release \ + cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} \ -DBUILD_SHARED_LIBS=ON \ -DBUILD_STATIC_LIBS=ON \ -DDYNAMIC_ARCH=ON \ From 41e51dbc1b8a334394e425cd9e70757aac9c0cb6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 2 Sep 2022 13:07:51 +0200 Subject: [PATCH 008/154] add target for mips xbuild --- .github/workflows/dynamic_arch.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index c34b0c462..1814e9e56 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -267,7 +267,7 @@ jobs: include: - target: mips64el triple: mips64el-linux-gnuabi64 - opts: DYNAMIC_ARCH=1 + opts: DYNAMIC_ARCH=1 TARGET=GENERIC - target: riscv64 triple: riscv64-linux-gnu opts: TARGET=RISCV64_GENERIC From 992a9222ffe7dd78fdad5ff1e0e32d11e9469d5a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 3 Sep 2022 09:56:25 +0200 Subject: [PATCH 009/154] Move all Apple jobs on Azure to macos-11 following deprecation --- azure-pipelines.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 1102bf0f5..67a343d8a 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -141,7 +141,7 @@ jobs: - job: OSX_OpenMP pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' steps: - script: | brew update @@ -151,7 +151,7 @@ jobs: - job: OSX_GCC_Nothreads pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' steps: - script: | brew update @@ -159,7 +159,7 @@ jobs: - job: OSX_OpenMP_Clang pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib LIBRARY_PATH: /usr/local/opt/llvm/lib @@ -172,7 +172,7 @@ jobs: - job: OSX_OpenMP_Clang_cmake pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib LIBRARY_PATH: /usr/local/opt/llvm/lib @@ -188,7 +188,7 @@ jobs: - job: OSX_dynarch_cmake pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib LIBRARY_PATH: /usr/local/opt/llvm/lib @@ -202,7 +202,7 @@ jobs: - job: OSX_Ifort_Clang pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg @@ -235,7 +235,7 @@ jobs: - job: OSX_NDK_ARMV7 pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' steps: - script: | brew update @@ -255,7 +255,7 @@ jobs: - job: OSX_IOS_ARMV7 pool: - vmImage: 'macOS-10.15' + vmImage: 'macOS-11' variables: CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch armv7 -miphoneos-version-min=5.1 From 739c3c44a77d87d1b08de59bf868250683c0f755 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 3 Sep 2022 15:01:22 +0200 Subject: [PATCH 010/154] Work around windows/osx gcc12 x86_64 tree-optimizer problem and add an osx/gcc12 build to Azure CI (#3745) Add pragma to disable the gcc tree-optimizer for some x86_64 S and Z kernels with gcc12 on OSX or Windows --- azure-pipelines.yml | 8 ++++++++ kernel/x86_64/sgemv_n_4.c | 5 ++++- kernel/x86_64/sgemv_t_4.c | 5 ++++- 
kernel/x86_64/ssymv_L.c | 5 ++++- kernel/x86_64/ssymv_U.c | 5 ++++- kernel/x86_64/zdot.c | 4 +++- kernel/x86_64/zgemv_n_4.c | 9 +++------ kernel/x86_64/zgemv_t_4.c | 4 +++- 8 files changed, 33 insertions(+), 12 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 1102bf0f5..8236c6cc3 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -157,6 +157,14 @@ jobs: brew update make USE_THREADS=0 CC=gcc-10 FC=gfortran-10 +- job: OSX_GCC12 + pool: + vmImage: 'macOS-latest' + steps: + - script: | + brew update + make CC=gcc-12 FC=gfortran-12 + - job: OSX_OpenMP_Clang pool: vmImage: 'macOS-10.15' diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 621ddc622..c9681fa8b 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -25,9 +25,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif + #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "sgemv_n_microk_bulldozer-4.c" diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 0be2c7e97..07aa51503 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -25,9 +25,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif + #if defined(NEHALEM) #include "sgemv_t_microk_nehalem-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index 29d6a9958..45914daf5 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -25,9 +25,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif + #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index 02bbc1c64..26e5ca7e9 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -25,9 +25,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ - #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif + #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "ssymv_U_microk_bulldozer-2.c" diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index c52575d07..27397ccfa 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -25,9 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif #if defined(BULLDOZER) #include "zdot_microk_bulldozer-2.c" diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index 2d6866a78..8fc960610 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -25,10 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif #if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zgemv_n_microk_haswell-4.c" @@ -231,10 +232,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, FLOAT xbuffer[8],*ybuffer; -#if 0 -printf("%s %d %d %.16f %.16f %d %d %d\n","zgemv_n",m,n,alpha_r,alpha_i,lda,inc_x,inc_y); -#endif - if ( m < 1 ) return(0); if ( n < 1 ) return(0); diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index c2791e0f3..63c8b11a4 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -25,9 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ - #include "common.h" +#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) +#pragma GCC optimize("no-tree-vectorize") +#endif #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zgemv_t_microk_bulldozer-4.c" From 389e378063720639447757a20cd61274521e0573 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 7 Sep 2022 09:01:03 +0200 Subject: [PATCH 011/154] Remove excessive quoting of arguments from PR3722 --- Makefile.prebuild | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile.prebuild b/Makefile.prebuild index 5dd7dfa4e..0be4f1274 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -60,9 +60,9 @@ all: getarch_2nd ./getarch_2nd 1 >> $(TARGET_CONF) $(TARGET_CONF): c_check$(SCRIPTSUFFIX) f_check$(SCRIPTSUFFIX) getarch - ./c_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) "$(CC)" "$(TARGET_FLAGS) $(CFLAGS)" + ./c_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) "$(CC)" $(TARGET_FLAGS) $(CFLAGS) ifneq ($(ONLY_CBLAS), 1) - ./f_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) "$(FC)" "$(TARGET_FLAGS)" + ./f_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) "$(FC)" $(TARGET_FLAGS) else #When we only build CBLAS, we set NOFORTRAN=2 echo "NOFORTRAN=2" >> $(TARGET_MAKE) @@ -77,8 +77,8 @@ endif getarch : getarch.c cpuid.S dummy $(CPUIDEMU) - avx512=$$(./c_check$(SCRIPTSUFFIX) - - "$(CC)" "$(TARGET_FLAGS) $(CFLAGS)" | grep NO_AVX512); \ - rv64gv=$$(./c_check$(SCRIPTSUFFIX) - - "$(CC)" "$(TARGET_FLAGS) $(CFLAGS)" | grep NO_RV64GV); \ + avx512=$$(./c_check$(SCRIPTSUFFIX) - - "$(CC)" $(TARGET_FLAGS) $(CFLAGS) | grep NO_AVX512); \ + rv64gv=$$(./c_check$(SCRIPTSUFFIX) - - "$(CC)" $(TARGET_FLAGS) $(CFLAGS) | grep NO_RV64GV); \ $(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) $${avx512:+-D$${avx512}} $${rv64gv:+-D$${rv64gv}} -o $(@F) getarch.c cpuid.S $(CPUIDEMU) getarch_2nd : getarch_2nd.c $(TARGET_CONF) dummy From 365936ae1b1dfa2f50b3e65c68ae95babc6f2af2 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 13 Sep 2022 16:38:01 +0800 Subject: [PATCH 012/154] MIPS64: Using the macro MTC rather than MTC1 --- kernel/mips64/dnrm2.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/mips64/dnrm2.S b/kernel/mips64/dnrm2.S index 0ccc781e1..cd40414a2 100644 --- a/kernel/mips64/dnrm2.S +++ b/kernel/mips64/dnrm2.S @@ -90,7 +90,7 @@ //Init INF lui TEMP, 0x7FF0 dsll TEMP, TEMP, 32 - MTC1 TEMP, INF + MTC TEMP, INF LD a1, 0 * SIZE(X) daddiu N, N, -1 From 23d59baaf19094a9e70721f4549c78f0a1c2f9a8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Sep 2022 22:39:27 +0200 Subject: [PATCH 013/154] Add -mfma to -mavx2 for Apple clang, and set AVX2 options for Zen as well --- kernel/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index cbe4cde6e..977886044 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -23,7 +23,7 @@ ifeq ($(C_COMPILER), CLANG) # Any clang posing as gcc 4.2 should be new enough (3.4 or later) GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2) ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) - AVX2OPT = -mavx2 + AVX2OPT = -mavx2 -mfma endif endif ifdef NO_AVX2 @@ -73,6 +73,8 @@ else ifeq ($(TARGET_CORE), SKYLAKEX) endif else ifeq ($(TARGET_CORE), HASWELL) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) +else ifeq ($(TARGET_CORE), ZEN) + 
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) else ifeq ($(TARGET_CORE), LOONGSON3R4) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) else From a0a4f7c44759e9e4705f0fb1e42d8c8c7c0c68b6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Sep 2022 22:47:00 +0200 Subject: [PATCH 014/154] Add -mfma to -mavx2 for clang, and add AVX2 declaration for Zen in DYNAMIC_ARCH builds --- cmake/system.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index a9fc0f4b7..fd68f79d6 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -197,14 +197,14 @@ if (DEFINED TARGET) if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") endif() - if (${TARGET} STREQUAL HASWELL AND NOT NO_AVX2) + if ((${TARGET} STREQUAL HASWELL OR ${TARGET} STREQUAL ZEN) AND NOT NO_AVX2) if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") endif() elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2 -mfma") endif() endif() if (DEFINED HAVE_AVX) From 515cf269291bec0d43651fe7bf99a71fb074a0ad Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Sep 2022 11:48:36 +0200 Subject: [PATCH 015/154] Fix pointer/integer argument mismatch in calls to pow() --- lapack-netlib/SRC/claed0.c | 4 ++-- lapack-netlib/SRC/claed7.c | 4 ++-- lapack-netlib/SRC/clalsa.c | 6 +++--- lapack-netlib/SRC/cstedc.c | 4 ++-- lapack-netlib/SRC/dlaed0.c | 4 ++-- lapack-netlib/SRC/dlaed7.c | 4 ++-- lapack-netlib/SRC/dlaeda.c | 8 ++++---- lapack-netlib/SRC/dlalsa.c | 6 +++--- lapack-netlib/SRC/dlasd0.c | 2 +- lapack-netlib/SRC/dlasda.c | 4 ++-- lapack-netlib/SRC/dstedc.c | 4 ++-- lapack-netlib/SRC/slaed0.c | 4 ++-- lapack-netlib/SRC/slaed7.c | 4 ++-- lapack-netlib/SRC/slaeda.c | 8 ++++---- lapack-netlib/SRC/slalsa.c | 6 +++--- lapack-netlib/SRC/slasd0.c | 2 +- lapack-netlib/SRC/slasda.c | 4 ++-- lapack-netlib/SRC/sstedc.c | 4 ++-- lapack-netlib/SRC/zlaed0.c | 4 ++-- lapack-netlib/SRC/zlaed7.c | 4 ++-- lapack-netlib/SRC/zlalsa.c | 6 +++--- lapack-netlib/SRC/zstedc.c | 4 ++-- 22 files changed, 50 insertions(+), 50 deletions(-) diff --git a/lapack-netlib/SRC/claed0.c b/lapack-netlib/SRC/claed0.c index 21e408397..2b696508e 100644 --- a/lapack-netlib/SRC/claed0.c +++ b/lapack-netlib/SRC/claed0.c @@ -796,10 +796,10 @@ L10: temp = log((real) (*n)) / log(2.f); lgn = (integer) temp; - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } iprmpt = indxq + *n + 1; diff --git a/lapack-netlib/SRC/claed7.c b/lapack-netlib/SRC/claed7.c index 49fc9ed4b..1eaa7e9c2 100644 --- a/lapack-netlib/SRC/claed7.c +++ b/lapack-netlib/SRC/claed7.c @@ -864,11 +864,11 @@ f"> */ /* Form the z-vector which consists of the last row of Q_1 and the */ /* first row of Q_2. 
*/ - ptr = pow_ii(&c__2, tlvls) + 1; + ptr = pow_ii(c__2, *tlvls) + 1; i__1 = *curlvl - 1; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = *tlvls - i__; - ptr += pow_ii(&c__2, &i__2); + ptr += pow_ii(c__2, i__2); /* L10: */ } curr = ptr + *curpbm; diff --git a/lapack-netlib/SRC/clalsa.c b/lapack-netlib/SRC/clalsa.c index 4bc3830a9..2ef3e1231 100644 --- a/lapack-netlib/SRC/clalsa.c +++ b/lapack-netlib/SRC/clalsa.c @@ -1051,7 +1051,7 @@ f"> */ /* Finally go through the left singular vector matrices of all */ /* the other subproblems bottom-up on the tree. */ - j = pow_ii(&c__2, &nlvl); + j = pow_ii(c__2, nlvl); sqre = 0; for (lvl = nlvl; lvl >= 1; --lvl) { @@ -1065,7 +1065,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; @@ -1110,7 +1110,7 @@ L170: ll = 1; } else { i__2 = lvl - 1; - lf = pow_ii(&c__2, &i__2); + lf = pow_ii(c__2, i__2); ll = (lf << 1) - 1; } i__2 = lf; diff --git a/lapack-netlib/SRC/cstedc.c b/lapack-netlib/SRC/cstedc.c index 437c39e96..8f047d1ce 100644 --- a/lapack-netlib/SRC/cstedc.c +++ b/lapack-netlib/SRC/cstedc.c @@ -836,10 +836,10 @@ f"> */ lrwmin = *n - 1 << 1; } else if (icompz == 1) { lgn = (integer) (log((real) (*n)) / log(2.f)); - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } lwmin = *n * *n; diff --git a/lapack-netlib/SRC/dlaed0.c b/lapack-netlib/SRC/dlaed0.c index 95e39b0df..74e58dd2d 100644 --- a/lapack-netlib/SRC/dlaed0.c +++ b/lapack-netlib/SRC/dlaed0.c @@ -827,10 +827,10 @@ L10: temp = log((doublereal) (*n)) / log(2.); lgn = (integer) temp; - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } iprmpt = indxq + *n + 1; diff --git a/lapack-netlib/SRC/dlaed7.c b/lapack-netlib/SRC/dlaed7.c index fd8515261..d23a72be0 100644 --- a/lapack-netlib/SRC/dlaed7.c +++ b/lapack-netlib/SRC/dlaed7.c @@ -885,11 +885,11 @@ f"> */ /* Form the z-vector which consists of the last row of Q_1 and the */ /* first row of Q_2. */ - ptr = pow_ii(&c__2, tlvls) + 1; + ptr = pow_ii(c__2, *tlvls) + 1; i__1 = *curlvl - 1; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = *tlvls - i__; - ptr += pow_ii(&c__2, &i__2); + ptr += pow_ii(c__2, i__2); /* L10: */ } curr = ptr + *curpbm; diff --git a/lapack-netlib/SRC/dlaeda.c b/lapack-netlib/SRC/dlaeda.c index f4bb214d3..202e1b636 100644 --- a/lapack-netlib/SRC/dlaeda.c +++ b/lapack-netlib/SRC/dlaeda.c @@ -754,7 +754,7 @@ f"> */ /* scheme */ i__1 = *curlvl - 1; - curr = ptr + *curpbm * pow_ii(&c__2, curlvl) + pow_ii(&c__2, &i__1) - 1; + curr = ptr + *curpbm * pow_ii(c__2, *curlvl) + pow_ii(c__2, i__1) - 1; /* Determine size of these matrices. We add HALF to the value of */ /* the SQRT in case the machine underestimates one of these square */ @@ -781,12 +781,12 @@ f"> */ /* rotations and permutation and then multiplying the center matrices */ /* against the current Z. 
*/ - ptr = pow_ii(&c__2, tlvls) + 1; + ptr = pow_ii(c__2, *tlvls) + 1; i__1 = *curlvl - 1; for (k = 1; k <= i__1; ++k) { i__2 = *curlvl - k; i__3 = *curlvl - k - 1; - curr = ptr + *curpbm * pow_ii(&c__2, &i__2) + pow_ii(&c__2, &i__3) - + curr = ptr + *curpbm * pow_ii(c__2, i__2) + pow_ii(c__2, i__3) - 1; psiz1 = prmptr[curr + 1] - prmptr[curr]; psiz2 = prmptr[curr + 2] - prmptr[curr + 1]; @@ -847,7 +847,7 @@ f"> */ c__1); i__2 = *tlvls - k; - ptr += pow_ii(&c__2, &i__2); + ptr += pow_ii(c__2, i__2); /* L70: */ } diff --git a/lapack-netlib/SRC/dlalsa.c b/lapack-netlib/SRC/dlalsa.c index 891ed66a8..4d5c347c3 100644 --- a/lapack-netlib/SRC/dlalsa.c +++ b/lapack-netlib/SRC/dlalsa.c @@ -951,7 +951,7 @@ f"> */ /* Finally go through the left singular vector matrices of all */ /* the other subproblems bottom-up on the tree. */ - j = pow_ii(&c__2, &nlvl); + j = pow_ii(c__2, nlvl); sqre = 0; for (lvl = nlvl; lvl >= 1; --lvl) { @@ -965,7 +965,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; @@ -1010,7 +1010,7 @@ L50: ll = 1; } else { i__2 = lvl - 1; - lf = pow_ii(&c__2, &i__2); + lf = pow_ii(c__2, i__2); ll = (lf << 1) - 1; } i__2 = lf; diff --git a/lapack-netlib/SRC/dlasd0.c b/lapack-netlib/SRC/dlasd0.c index c702665b0..0f88527ef 100644 --- a/lapack-netlib/SRC/dlasd0.c +++ b/lapack-netlib/SRC/dlasd0.c @@ -824,7 +824,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; diff --git a/lapack-netlib/SRC/dlasda.c b/lapack-netlib/SRC/dlasda.c index 72f9d55f3..a9190f805 100644 --- a/lapack-netlib/SRC/dlasda.c +++ b/lapack-netlib/SRC/dlasda.c @@ -1027,7 +1027,7 @@ f"> */ /* Now conquer each subproblem bottom-up. */ - j = pow_ii(&c__2, &nlvl); + j = pow_ii(c__2, nlvl); for (lvl = nlvl; lvl >= 1; --lvl) { lvl2 = (lvl << 1) - 1; @@ -1039,7 +1039,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; diff --git a/lapack-netlib/SRC/dstedc.c b/lapack-netlib/SRC/dstedc.c index ef2eeabe8..56511d6cf 100644 --- a/lapack-netlib/SRC/dstedc.c +++ b/lapack-netlib/SRC/dstedc.c @@ -806,10 +806,10 @@ f"> */ lwmin = *n - 1 << 1; } else { lgn = (integer) (log((doublereal) (*n)) / log(2.)); - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } if (icompz == 1) { diff --git a/lapack-netlib/SRC/slaed0.c b/lapack-netlib/SRC/slaed0.c index 33f7134c1..4c5230907 100644 --- a/lapack-netlib/SRC/slaed0.c +++ b/lapack-netlib/SRC/slaed0.c @@ -823,10 +823,10 @@ L10: temp = log((real) (*n)) / log(2.f); lgn = (integer) temp; - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } iprmpt = indxq + *n + 1; diff --git a/lapack-netlib/SRC/slaed7.c b/lapack-netlib/SRC/slaed7.c index 210d796d1..22fcaf76d 100644 --- a/lapack-netlib/SRC/slaed7.c +++ b/lapack-netlib/SRC/slaed7.c @@ -883,11 +883,11 @@ f"> */ /* Form the z-vector which consists of the last row of Q_1 and the */ /* first row of Q_2. 
*/ - ptr = pow_ii(&c__2, tlvls) + 1; + ptr = pow_ii(c__2, *tlvls) + 1; i__1 = *curlvl - 1; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = *tlvls - i__; - ptr += pow_ii(&c__2, &i__2); + ptr += pow_ii(c__2, i__2); /* L10: */ } curr = ptr + *curpbm; diff --git a/lapack-netlib/SRC/slaeda.c b/lapack-netlib/SRC/slaeda.c index 7edaf8a76..3806427c2 100644 --- a/lapack-netlib/SRC/slaeda.c +++ b/lapack-netlib/SRC/slaeda.c @@ -753,7 +753,7 @@ f"> */ /* scheme */ i__1 = *curlvl - 1; - curr = ptr + *curpbm * pow_ii(&c__2, curlvl) + pow_ii(&c__2, &i__1) - 1; + curr = ptr + *curpbm * pow_ii(c__2, *curlvl) + pow_ii(c__2, i__1) - 1; /* Determine size of these matrices. We add HALF to the value of */ /* the SQRT in case the machine underestimates one of these square */ @@ -779,12 +779,12 @@ f"> */ /* rotations and permutation and then multiplying the center matrices */ /* against the current Z. */ - ptr = pow_ii(&c__2, tlvls) + 1; + ptr = pow_ii(c__2, *tlvls) + 1; i__1 = *curlvl - 1; for (k = 1; k <= i__1; ++k) { i__2 = *curlvl - k; i__3 = *curlvl - k - 1; - curr = ptr + *curpbm * pow_ii(&c__2, &i__2) + pow_ii(&c__2, &i__3) - + curr = ptr + *curpbm * pow_ii(c__2, i__2) + pow_ii(c__2, i__3) - 1; psiz1 = prmptr[curr + 1] - prmptr[curr]; psiz2 = prmptr[curr + 2] - prmptr[curr + 1]; @@ -844,7 +844,7 @@ f"> */ c__1); i__2 = *tlvls - k; - ptr += pow_ii(&c__2, &i__2); + ptr += pow_ii(c__2, i__2); /* L70: */ } diff --git a/lapack-netlib/SRC/slalsa.c b/lapack-netlib/SRC/slalsa.c index 53da2c7bf..77a79b80c 100644 --- a/lapack-netlib/SRC/slalsa.c +++ b/lapack-netlib/SRC/slalsa.c @@ -946,7 +946,7 @@ f"> */ /* Finally go through the left singular vector matrices of all */ /* the other subproblems bottom-up on the tree. */ - j = pow_ii(&c__2, &nlvl); + j = pow_ii(c__2, nlvl); sqre = 0; for (lvl = nlvl; lvl >= 1; --lvl) { @@ -960,7 +960,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; @@ -1005,7 +1005,7 @@ L50: ll = 1; } else { i__2 = lvl - 1; - lf = pow_ii(&c__2, &i__2); + lf = pow_ii(c__2, i__2); ll = (lf << 1) - 1; } i__2 = lf; diff --git a/lapack-netlib/SRC/slasd0.c b/lapack-netlib/SRC/slasd0.c index aa553579e..be1a74191 100644 --- a/lapack-netlib/SRC/slasd0.c +++ b/lapack-netlib/SRC/slasd0.c @@ -821,7 +821,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; diff --git a/lapack-netlib/SRC/slasda.c b/lapack-netlib/SRC/slasda.c index 71424c3f1..1d336d1ce 100644 --- a/lapack-netlib/SRC/slasda.c +++ b/lapack-netlib/SRC/slasda.c @@ -1023,7 +1023,7 @@ f"> */ /* Now conquer each subproblem bottom-up. 
*/ - j = pow_ii(&c__2, &nlvl); + j = pow_ii(c__2, nlvl); for (lvl = nlvl; lvl >= 1; --lvl) { lvl2 = (lvl << 1) - 1; @@ -1035,7 +1035,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; diff --git a/lapack-netlib/SRC/sstedc.c b/lapack-netlib/SRC/sstedc.c index 46ed15a1a..61ad3dd37 100644 --- a/lapack-netlib/SRC/sstedc.c +++ b/lapack-netlib/SRC/sstedc.c @@ -804,10 +804,10 @@ f"> */ lwmin = *n - 1 << 1; } else { lgn = (integer) (log((real) (*n)) / log(2.f)); - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } if (icompz == 1) { diff --git a/lapack-netlib/SRC/zlaed0.c b/lapack-netlib/SRC/zlaed0.c index 37bd12b01..2b25f6e4e 100644 --- a/lapack-netlib/SRC/zlaed0.c +++ b/lapack-netlib/SRC/zlaed0.c @@ -793,10 +793,10 @@ L10: temp = log((doublereal) (*n)) / log(2.); lgn = (integer) temp; - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } iprmpt = indxq + *n + 1; diff --git a/lapack-netlib/SRC/zlaed7.c b/lapack-netlib/SRC/zlaed7.c index 093051917..8665ee12c 100644 --- a/lapack-netlib/SRC/zlaed7.c +++ b/lapack-netlib/SRC/zlaed7.c @@ -864,11 +864,11 @@ f"> */ /* Form the z-vector which consists of the last row of Q_1 and the */ /* first row of Q_2. */ - ptr = pow_ii(&c__2, tlvls) + 1; + ptr = pow_ii(c__2, *tlvls) + 1; i__1 = *curlvl - 1; for (i__ = 1; i__ <= i__1; ++i__) { i__2 = *tlvls - i__; - ptr += pow_ii(&c__2, &i__2); + ptr += pow_ii(c__2, i__2); /* L10: */ } curr = ptr + *curpbm; diff --git a/lapack-netlib/SRC/zlalsa.c b/lapack-netlib/SRC/zlalsa.c index d17016e7d..cd0819c3d 100644 --- a/lapack-netlib/SRC/zlalsa.c +++ b/lapack-netlib/SRC/zlalsa.c @@ -1051,7 +1051,7 @@ f"> */ /* Finally go through the left singular vector matrices of all */ /* the other subproblems bottom-up on the tree. 
*/ - j = pow_ii(&c__2, &nlvl); + j = pow_ii(c__2, nlvl); sqre = 0; for (lvl = nlvl; lvl >= 1; --lvl) { @@ -1065,7 +1065,7 @@ f"> */ ll = 1; } else { i__1 = lvl - 1; - lf = pow_ii(&c__2, &i__1); + lf = pow_ii(c__2, i__1); ll = (lf << 1) - 1; } i__1 = ll; @@ -1110,7 +1110,7 @@ L170: ll = 1; } else { i__2 = lvl - 1; - lf = pow_ii(&c__2, &i__2); + lf = pow_ii(c__2, i__2); ll = (lf << 1) - 1; } i__2 = lf; diff --git a/lapack-netlib/SRC/zstedc.c b/lapack-netlib/SRC/zstedc.c index 4cfc41840..55baba2d7 100644 --- a/lapack-netlib/SRC/zstedc.c +++ b/lapack-netlib/SRC/zstedc.c @@ -836,10 +836,10 @@ f"> */ lrwmin = *n - 1 << 1; } else if (icompz == 1) { lgn = (integer) (log((doublereal) (*n)) / log(2.)); - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } - if (pow_ii(&c__2, &lgn) < *n) { + if (pow_ii(c__2, lgn) < *n) { ++lgn; } lwmin = *n * *n; From 91110f92d218492d0efbdc1fdf34277ca45f4b36 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Sep 2022 14:03:31 +0200 Subject: [PATCH 016/154] fix missing return type in function declaration --- ctest/c_sblat1c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctest/c_sblat1c.c b/ctest/c_sblat1c.c index 4993d31bb..57e4707a9 100644 --- a/ctest/c_sblat1c.c +++ b/ctest/c_sblat1c.c @@ -969,7 +969,7 @@ real *sfac; 1.17 }; /* Local variables */ - extern /* Subroutine */ srottest_(); + extern /* Subroutine */ void srottest_(); static integer i__, k, ksize; extern /* Subroutine */ int stest_(), srotmtest_(); static integer ki, kn; From 9773a9d6b3da46a8c499d0dcc22030641006fa7b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Sep 2022 17:04:11 +0200 Subject: [PATCH 017/154] undefine YIELDING for the Emscripten js converter --- common.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/common.h b/common.h index e6002d322..4eeeb8d55 100644 --- a/common.h +++ b/common.h @@ -387,6 +387,10 @@ typedef int blasint; #endif */ +#ifdef __EMSCRIPTEN__ +#define YIELDING +#endif + #ifndef YIELDING #define YIELDING sched_yield() #endif From b285307e184f8ff2a3e430442756c735a0243671 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Sep 2022 17:05:24 +0200 Subject: [PATCH 018/154] Add a kludge for the Emscripten js converter --- ctest.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ctest.c b/ctest.c index df628b1d4..2ccae8dcc 100644 --- a/ctest.c +++ b/ctest.c @@ -173,3 +173,8 @@ HAVE_C11 ARCH_E2K #endif +#if defined(__EMSCRIPTEN__) +ARCH_RISCV64 +OS_WINDOWS +#endif + From 9402df5604e69f86f58953e3883f33f98c930baf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Sep 2022 21:44:34 +0200 Subject: [PATCH 019/154] Fix missing external declaration --- driver/others/blas_server_omp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 1a5fd06a3..c158f92ee 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -69,6 +69,8 @@ int blas_server_avail = 0; +extern int openblas_omp_adaptive_env(); + static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; #ifdef HAVE_C11 static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; From 101a2c77c3f3610933f450cefca3e312edab2186 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Sep 2022 09:19:19 +0200 Subject: [PATCH 020/154] Fix warnings --- kernel/x86_64/dgemm_ncopy_8_skylakex.c | 24 ++++++++++++------------ kernel/x86_64/omatcopy_rt.c | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git 
a/kernel/x86_64/dgemm_ncopy_8_skylakex.c b/kernel/x86_64/dgemm_ncopy_8_skylakex.c index 74b336f3d..874ef68d6 100644 --- a/kernel/x86_64/dgemm_ncopy_8_skylakex.c +++ b/kernel/x86_64/dgemm_ncopy_8_skylakex.c @@ -52,18 +52,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __ FLOAT ctemp05, ctemp06, ctemp07, ctemp08; FLOAT ctemp09, ctemp10, ctemp11, ctemp12; FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - FLOAT ctemp17, ctemp18, ctemp19, ctemp20; - FLOAT ctemp21, ctemp22, ctemp23, ctemp24; - FLOAT ctemp25, ctemp26, ctemp27, ctemp28; - FLOAT ctemp29, ctemp30, ctemp31, ctemp32; - FLOAT ctemp33, ctemp34, ctemp35, ctemp36; - FLOAT ctemp37, ctemp38, ctemp39, ctemp40; - FLOAT ctemp41, ctemp42, ctemp43, ctemp44; - FLOAT ctemp45, ctemp46, ctemp47, ctemp48; - FLOAT ctemp49, ctemp50, ctemp51, ctemp52; - FLOAT ctemp53, ctemp54, ctemp55, ctemp56; - FLOAT ctemp57, ctemp58, ctemp59, ctemp60; - FLOAT ctemp61, ctemp62, ctemp63, ctemp64; + FLOAT ctemp17 /*, ctemp18, ctemp19, ctemp20*/ ; + FLOAT /*ctemp21, ctemp22,*/ ctemp23, ctemp24; + FLOAT ctemp25 /*, ctemp26, ctemp27, ctemp28*/ ; + FLOAT /*ctemp29, ctemp30,*/ ctemp31, ctemp32; + FLOAT ctemp33 /*, ctemp34, ctemp35, ctemp36*/ ; + FLOAT /*ctemp37, ctemp38,*/ ctemp39, ctemp40; + FLOAT ctemp41 /*, ctemp42, ctemp43, ctemp44*/ ; + FLOAT /*ctemp45, ctemp46,*/ ctemp47, ctemp48; + FLOAT ctemp49 /*, ctemp50, ctemp51, ctemp52*/ ; + FLOAT /*ctemp53, ctemp54,*/ ctemp55, ctemp56; + FLOAT ctemp57 /*, ctemp58, ctemp59, ctemp60*/ ; + FLOAT /*ctemp61, ctemp62,*/ ctemp63, ctemp64; aoffset = a; diff --git a/kernel/x86_64/omatcopy_rt.c b/kernel/x86_64/omatcopy_rt.c index e695f00c5..b11893f5d 100644 --- a/kernel/x86_64/omatcopy_rt.c +++ b/kernel/x86_64/omatcopy_rt.c @@ -142,7 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
,"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ } int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb){ - float *src, *dst, *dst_tmp, *src_base, *dst_base; + float *src, *dst, *dst_tmp=0, *src_base, *dst_base; uint64_t src_ld_bytes = (uint64_t)lda * sizeof(float), dst_ld_bytes = (uint64_t)ldb * sizeof(float), num_rows = 0; BLASLONG cols_left, rows_done; float ALPHA = alpha; if(ALPHA==0.0){ From 548a11b9d9aa7e5298f6a9092d917255d6f21644 Mon Sep 17 00:00:00 2001 From: gxw Date: Fri, 16 Sep 2022 09:19:54 +0800 Subject: [PATCH 021/154] [WIP,Testing]: Add test for mips64 --- .github/workflows/mips64.yml | 114 +++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 .github/workflows/mips64.yml diff --git a/.github/workflows/mips64.yml b/.github/workflows/mips64.yml new file mode 100644 index 000000000..a5bd7b84b --- /dev/null +++ b/.github/workflows/mips64.yml @@ -0,0 +1,114 @@ +name: mips64 qemu test + +on: [push, pull_request] + +jobs: + TEST: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - target: MIPS64_GENERIC + triple: mips64el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=MIPS64_GENERIC + - target: SICORTEX + triple: mips64el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=SICORTEX + - target: I6400 + triple: mipsisa64r6el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=I6400 + - target: P6600 + triple: mipsisa64r6el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=P6600 + - target: I6500 + triple: mipsisa64r6el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=I6500 + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: install build deps + run: | + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ + gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross + + - name: checkout qemu + uses: actions/checkout@v3 + with: + repository: qemu/qemu + path: qemu + ref: 79dfa177ae348bb5ab5f97c0915359b13d6186e2 + + - name: build qemu + run: | + cd qemu + ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mips64el-linux-user --disable-system + make -j$(nproc) + make install + + - name: Compilation cache + uses: actions/cache@v3 + with: + path: ~/.ccache + key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ matrix.target }} + + - name: Configure ccache + run: | + test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 300M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + - name: build OpenBLAS + run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) + + - name: test + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH + qemu-mips64el ./utest/openblas_utest + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat2 < ./ctest/sin2 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat2 < ./ctest/din2 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat2 < ./ctest/cin2 + OPENBLAS_NUM_THREADS=2 qemu-mips64el 
./ctest/xzcblat2 < ./ctest/zin2 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat3 < ./ctest/sin3 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat3 < ./ctest/din3 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat3 < ./ctest/cin3 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat3 < ./ctest/zin3 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat1 + rm -f ./test/?BLAT2.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat2 < ./test/zblat2.dat + rm -f ./test/?BLAT2.SUMM + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat2 < ./test/zblat2.dat + rm -f ./test/?BLAT3.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat3 < ./test/zblat3.dat + rm -f ./test/?BLAT3.SUMM + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat3 < ./test/zblat3.dat From edea1bcfafef8aab3528fd1859df229fde47a913 Mon Sep 17 00:00:00 2001 From: gxw Date: Sat, 17 Sep 2022 16:39:30 +0800 Subject: [PATCH 022/154] MIPS64: Fixed failed utest dsdot:dsdot_n_1 when TARGET=I6500 --- kernel/mips/sdot_msa.c | 151 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) diff --git a/kernel/mips/sdot_msa.c b/kernel/mips/sdot_msa.c index e02e10c61..8c250d401 100644 --- a/kernel/mips/sdot_msa.c +++ b/kernel/mips/sdot_msa.c @@ -39,10 +39,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT x0, x1, x2, x3, y0, y1, y2, y3; v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; +#if defined(DSDOT) + v2f64 dvx0, dvx1, dvx2, dvx3, dvx4, dvx5, dvx6, dvx7; + v2f64 dvy0, dvy1, dvy2, dvy3, dvy4, dvy5, dvy6, dvy7; + v2f64 dot0 = {0, 0}; + v2f64 dot1 = {0, 0}; + v2f64 dot2 = {0, 0}; + v2f64 dot3 = {0, 0}; +#else v4f32 dot0 = {0, 0, 0, 0}; v4f32 dot1 = {0, 0, 0, 0}; v4f32 dot2 = {0, 0, 0, 0}; v4f32 dot3 = {0, 0, 0, 0}; +#endif if (n < 1) return (dot); @@ -83,6 +92,61 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) x_pref += 32; y_pref += 32; +#if defined(DSDOT) + /* Extend single precision to double precision */ + dvy0 = __msa_fexupr_d(vy0); + dvy1 = __msa_fexupr_d(vy1); + 
dvy2 = __msa_fexupr_d(vy2); + dvy3 = __msa_fexupr_d(vy3); + dvy4 = __msa_fexupr_d(vy4); + dvy5 = __msa_fexupr_d(vy5); + dvy6 = __msa_fexupr_d(vy6); + dvy7 = __msa_fexupr_d(vy7); + + vy0 = (v4f32)__msa_fexupl_d(vy0); + vy1 = (v4f32)__msa_fexupl_d(vy1); + vy2 = (v4f32)__msa_fexupl_d(vy2); + vy3 = (v4f32)__msa_fexupl_d(vy3); + vy4 = (v4f32)__msa_fexupl_d(vy4); + vy5 = (v4f32)__msa_fexupl_d(vy5); + vy6 = (v4f32)__msa_fexupl_d(vy6); + vy7 = (v4f32)__msa_fexupl_d(vy7); + + dvx0 = __msa_fexupr_d(vx0); + dvx1 = __msa_fexupr_d(vx1); + dvx2 = __msa_fexupr_d(vx2); + dvx3 = __msa_fexupr_d(vx3); + dvx4 = __msa_fexupr_d(vx4); + dvx5 = __msa_fexupr_d(vx5); + dvx6 = __msa_fexupr_d(vx6); + dvx7 = __msa_fexupr_d(vx7); + + vx0 = (v4f32)__msa_fexupl_d(vx0); + vx1 = (v4f32)__msa_fexupl_d(vx1); + vx2 = (v4f32)__msa_fexupl_d(vx2); + vx3 = (v4f32)__msa_fexupl_d(vx3); + vx4 = (v4f32)__msa_fexupl_d(vx4); + vx5 = (v4f32)__msa_fexupl_d(vx5); + vx6 = (v4f32)__msa_fexupl_d(vx6); + vx7 = (v4f32)__msa_fexupl_d(vx7); + + dot0 += (dvy0 * dvx0); + dot1 += (dvy1 * dvx1); + dot2 += (dvy2 * dvx2); + dot3 += (dvy3 * dvx3); + dot0 += (dvy4 * dvx4); + dot1 += (dvy5 * dvx5); + dot2 += (dvy6 * dvx6); + dot3 += (dvy7 * dvx7); + dot0 += ((v2f64)vy0 * (v2f64)vx0); + dot1 += ((v2f64)vy1 * (v2f64)vx1); + dot2 += ((v2f64)vy2 * (v2f64)vx2); + dot3 += ((v2f64)vy3 * (v2f64)vx3); + dot0 += ((v2f64)vy4 * (v2f64)vx4); + dot1 += ((v2f64)vy5 * (v2f64)vx5); + dot2 += ((v2f64)vy6 * (v2f64)vx6); + dot3 += ((v2f64)vy7 * (v2f64)vx7); +#else dot0 += (vy0 * vx0); dot1 += (vy1 * vx1); dot2 += (vy2 * vx2); @@ -91,6 +155,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) dot1 += (vy5 * vx5); dot2 += (vy6 * vx6); dot3 += (vy7 * vx7); +#endif } if (n & 31) @@ -100,10 +165,41 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); +#if defined(DSDOT) + dvy0 = __msa_fexupr_d(vy0); + dvy1 = __msa_fexupr_d(vy1); + dvy2 = __msa_fexupr_d(vy2); + dvy3 = __msa_fexupr_d(vy3); + + vy0 = (v4f32)__msa_fexupl_d(vy0); + vy1 = (v4f32)__msa_fexupl_d(vy1); + vy2 = (v4f32)__msa_fexupl_d(vy2); + vy3 = (v4f32)__msa_fexupl_d(vy3); + + dvx0 = __msa_fexupr_d(vx0); + dvx1 = __msa_fexupr_d(vx1); + dvx2 = __msa_fexupr_d(vx2); + dvx3 = __msa_fexupr_d(vx3); + + vx0 = (v4f32)__msa_fexupl_d(vx0); + vx1 = (v4f32)__msa_fexupl_d(vx1); + vx2 = (v4f32)__msa_fexupl_d(vx2); + vx3 = (v4f32)__msa_fexupl_d(vx3); + + dot0 += (dvy0 * dvx0); + dot1 += (dvy1 * dvx1); + dot2 += (dvy2 * dvx2); + dot3 += (dvy3 * dvx3); + dot0 += ((v2f64)vy0 * (v2f64)vx0); + dot1 += ((v2f64)vy1 * (v2f64)vx1); + dot2 += ((v2f64)vy2 * (v2f64)vx2); + dot3 += ((v2f64)vy3 * (v2f64)vx3); +#else dot0 += (vy0 * vx0); dot1 += (vy1 * vx1); dot2 += (vy2 * vx2); dot3 += (vy3 * vx3); +#endif } if (n & 8) @@ -111,8 +207,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_SP2_INC(x, 4, vx0, vx1); LD_SP2_INC(y, 4, vy0, vy1); +#if defined(DSDOT) + dvy0 = __msa_fexupr_d(vy0); + dvy1 = __msa_fexupr_d(vy1); + + vy0 = (v4f32)__msa_fexupl_d(vy0); + vy1 = (v4f32)__msa_fexupl_d(vy1); + + dvx0 = __msa_fexupr_d(vx0); + dvx1 = __msa_fexupr_d(vx1); + + vx0 = (v4f32)__msa_fexupl_d(vx0); + vx1 = (v4f32)__msa_fexupl_d(vx1); + + dot0 += (dvy0 * dvx0); + dot1 += (dvy1 * dvx1); + dot0 += ((v2f64)vy0 * (v2f64)vx0); + dot1 += ((v2f64)vy1 * (v2f64)vx1); +#else dot0 += (vy0 * vx0); dot1 += (vy1 * vx1); +#endif } if (n & 4) @@ -120,7 +235,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT 
*y, BLASLONG inc_y) vx0 = LD_SP(x); x += 4; vy0 = LD_SP(y); y += 4; +#if defined(DSDOT) + dvy0 = __msa_fexupr_d(vy0); + vy0 = (v4f32)__msa_fexupl_d(vy0); + dvx0 = __msa_fexupr_d(vx0); + vx0 = (v4f32)__msa_fexupl_d(vx0); + dot0 += (dvy0 * dvx0); + dot0 += ((v2f64)vy0 * (v2f64)vx0); +#else dot0 += (vy0 * vx0); +#endif } if (n & 2) @@ -128,8 +252,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_GP2_INC(x, 1, x0, x1); LD_GP2_INC(y, 1, y0, y1); +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); + dot += ((double)y1 * (double)x1); +#else dot += (y0 * x0); dot += (y1 * x1); +#endif } if (n & 1) @@ -137,7 +266,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) x0 = *x; y0 = *y; +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); +#else dot += (y0 * x0); +#endif } } @@ -145,8 +278,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) dot += dot0[0]; dot += dot0[1]; +#if !defined(DSDOT) dot += dot0[2]; dot += dot0[3]; +#endif } else { @@ -155,10 +290,17 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_GP4_INC(x, inc_x, x0, x1, x2, x3); LD_GP4_INC(y, inc_y, y0, y1, y2, y3); +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); + dot += ((double)y1 * (double)x1); + dot += ((double)y2 * (double)x2); + dot += ((double)y3 * (double)x3); +#else dot += (y0 * x0); dot += (y1 * x1); dot += (y2 * x2); dot += (y3 * x3); +#endif } if (n & 2) @@ -166,8 +308,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_GP2_INC(x, inc_x, x0, x1); LD_GP2_INC(y, inc_y, y0, y1); +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); + dot += ((double)y1 * (double)x1); +#else dot += (y0 * x0); dot += (y1 * x1); +#endif } if (n & 1) @@ -175,7 +322,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) x0 = *x; y0 = *y; +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); +#else dot += (y0 * x0); +#endif } } From b1d69fb3ac429f64fcb18b2ef4283f1701d67aa2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 17 Sep 2022 23:52:32 +0200 Subject: [PATCH 023/154] Add MIPS64_GENERIC as a copy of GENERIC --- kernel/mips64/KERNEL.MIPS64_GENERIC | 160 ++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 kernel/mips64/KERNEL.MIPS64_GENERIC diff --git a/kernel/mips64/KERNEL.MIPS64_GENERIC b/kernel/mips64/KERNEL.MIPS64_GENERIC new file mode 100644 index 000000000..17f2ef976 --- /dev/null +++ b/kernel/mips64/KERNEL.MIPS64_GENERIC @@ -0,0 +1,160 @@ +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = 
../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Pure C for other kernels +SAMAXKERNEL = ../mips/amax.c +DAMAXKERNEL = ../mips/amax.c +CAMAXKERNEL = ../mips/zamax.c +ZAMAXKERNEL = ../mips/zamax.c + +SAMINKERNEL = ../mips/amin.c +DAMINKERNEL = ../mips/amin.c +CAMINKERNEL = ../mips/zamin.c +ZAMINKERNEL = ../mips/zamin.c + +SMAXKERNEL = ../mips/max.c +DMAXKERNEL = ../mips/max.c + +SMINKERNEL = ../mips/min.c +DMINKERNEL = ../mips/min.c + +ISAMAXKERNEL = ../mips/iamax.c +IDAMAXKERNEL = ../mips/iamax.c +ICAMAXKERNEL = ../mips/izamax.c +IZAMAXKERNEL = ../mips/izamax.c + +ISAMINKERNEL = ../mips/iamin.c +IDAMINKERNEL = ../mips/iamin.c +ICAMINKERNEL = ../mips/izamin.c +IZAMINKERNEL = ../mips/izamin.c + +ISMAXKERNEL = ../mips/imax.c +IDMAXKERNEL = ../mips/imax.c + +ISMINKERNEL = ../mips/imin.c +IDMINKERNEL = ../mips/imin.c + +SASUMKERNEL = ../mips/asum.c +DASUMKERNEL = ../mips/asum.c +CASUMKERNEL = ../mips/zasum.c +ZASUMKERNEL = ../mips/zasum.c + +SSUMKERNEL = ../mips/sum.c +DSUMKERNEL = ../mips/sum.c +CSUMKERNEL = ../mips/zsum.c +ZSUMKERNEL = ../mips/zsum.c + +SAXPYKERNEL = ../mips/axpy.c +DAXPYKERNEL = ../mips/axpy.c +CAXPYKERNEL = ../mips/zaxpy.c +ZAXPYKERNEL = ../mips/zaxpy.c + +SCOPYKERNEL = ../mips/copy.c +DCOPYKERNEL = ../mips/copy.c +CCOPYKERNEL = ../mips/zcopy.c +ZCOPYKERNEL = ../mips/zcopy.c + +SDOTKERNEL = ../mips/dot.c +DDOTKERNEL = ../mips/dot.c +CDOTKERNEL = ../mips/zdot.c +ZDOTKERNEL = ../mips/zdot.c + +SNRM2KERNEL = ../mips/nrm2.c +DNRM2KERNEL = ../mips/nrm2.c +CNRM2KERNEL = ../mips/znrm2.c +ZNRM2KERNEL = ../mips/znrm2.c + +SROTKERNEL = ../mips/rot.c +DROTKERNEL = ../mips/rot.c +CROTKERNEL = ../mips/zrot.c +ZROTKERNEL = ../mips/zrot.c + +SSCALKERNEL = ../mips/scal.c +DSCALKERNEL = ../mips/scal.c +CSCALKERNEL = ../mips/zscal.c +ZSCALKERNEL = ../mips/zscal.c + +SSWAPKERNEL = ../mips/swap.c +DSWAPKERNEL = ../mips/swap.c +CSWAPKERNEL = ../mips/zswap.c +ZSWAPKERNEL = ../mips/zswap.c + +SGEMVNKERNEL = ../mips/gemv_n.c +DGEMVNKERNEL = ../mips/gemv_n.c +CGEMVNKERNEL = ../mips/zgemv_n.c +ZGEMVNKERNEL = ../mips/zgemv_n.c + +SGEMVTKERNEL = ../mips/gemv_t.c +DGEMVTKERNEL = ../mips/gemv_t.c +CGEMVTKERNEL = ../mips/zgemv_t.c +ZGEMVTKERNEL = ../mips/zgemv_t.c + +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = ../generic/symv_k.c +QSYMV_U_KERNEL = ../generic/symv_k.c +QSYMV_L_KERNEL = ../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c +XSYMV_U_KERNEL = ../generic/zsymv_k.c +XSYMV_L_KERNEL = ../generic/zsymv_k.c + +ZHEMV_U_KERNEL = ../generic/zhemv_k.c +ZHEMV_L_KERNEL = 
../generic/zhemv_k.c + +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c From 84453b924fe7695029cad974dfe0cf7bf6ffe0f6 Mon Sep 17 00:00:00 2001 From: "Kai T. Ohlhus" Date: Thu, 22 Sep 2022 00:20:40 +0900 Subject: [PATCH 024/154] Support CONSISTENT_FPCSR on AARCH64 --- driver/others/blas_server.c | 8 ++++++++ driver/others/blas_server_omp.c | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 9cfd825ec..051513f27 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -470,9 +470,13 @@ blas_queue_t *tscq; #endif #ifdef CONSISTENT_FPCSR +#ifdef __aarch64__ + __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode)); +#else __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); #endif +#endif #ifdef MONITOR main_status[cpu] = MAIN_RUNNING1; @@ -746,9 +750,13 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ queue -> position = pos; #ifdef CONSISTENT_FPCSR +#ifdef __aarch64__ + __asm__ __volatile__ ("mrs %0, fpcr" : "=r" (queue -> sse_mode)); +#else __asm__ __volatile__ ("fnstcw %0" : "=m" (queue -> x87_mode)); __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue -> sse_mode)); #endif +#endif #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index c158f92ee..e06ab8404 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -284,8 +284,12 @@ static void exec_threads(blas_queue_t *queue, int buf_index){ sb = queue -> sb; #ifdef CONSISTENT_FPCSR +#ifdef __aarch64__ + __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode)); +#else __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); +#endif #endif if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { @@ -383,8 +387,12 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ #ifdef CONSISTENT_FPCSR for (i = 0; i < num; i ++) { +#ifdef __aarch64__ + __asm__ __volatile__ ("mrs %0, fpcr" : "=r" (queue[i].sse_mode)); +#else __asm__ __volatile__ ("fnstcw %0" : "=m" (queue[i].x87_mode)); __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue[i].sse_mode)); +#endif } #endif From c2892f0e31d41f5e8d6c1324c6592459c19b4c59 Mon Sep 17 00:00:00 2001 From: "Kai T. Ohlhus" Date: Thu, 22 Sep 2022 00:25:13 +0900 Subject: [PATCH 025/154] Makefile.rule: update CONSISTENT_FPCSR documentation --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 359672359..a0ad90a68 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -207,7 +207,7 @@ NO_AFFINITY = 1 # to the user space. If bigphysarea is enabled, it will use it. # DEVICEDRIVER_ALLOCATION = 1 -# If you need to synchronize FP CSR between threads (for x86/x86_64 only). +# If you need to synchronize FP CSR between threads (for x86/x86_64 and aarch64 only). 
# CONSISTENT_FPCSR = 1 # If any gemm argument m, n or k is less or equal this threshold, gemm will be execute From 11cd1080958dc17c36857f1a6d5d9e705f144440 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 26 Sep 2022 13:46:34 +0200 Subject: [PATCH 026/154] build: harden nightly-Homebrew-build.yml permissions Signed-off-by: Alex --- .github/workflows/nightly-Homebrew-build.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml index 29ec96f73..37ffe9e83 100644 --- a/.github/workflows/nightly-Homebrew-build.yml +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -17,6 +17,10 @@ on: # it only makes sense to test if this file has been changed name: Nightly-Homebrew-Build + +permissions: + contents: read # to fetch code (actions/checkout) + jobs: build-OpenBLAS-with-Homebrew: runs-on: macos-latest From 4de8e1b8f922e531e9c49d8deb35fef993d17ee4 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 26 Sep 2022 13:47:15 +0200 Subject: [PATCH 027/154] build: harden mips64.yml permissions Signed-off-by: Alex --- .github/workflows/mips64.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/mips64.yml b/.github/workflows/mips64.yml index a5bd7b84b..de7c0c0f3 100644 --- a/.github/workflows/mips64.yml +++ b/.github/workflows/mips64.yml @@ -2,6 +2,9 @@ name: mips64 qemu test on: [push, pull_request] +permissions: + contents: read # to fetch code (actions/checkout) + jobs: TEST: runs-on: ubuntu-latest From c726604319a038a7558d638985bbb60ac4983285 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 26 Sep 2022 13:48:11 +0200 Subject: [PATCH 028/154] build: harden dynamic_arch.yml permissions Signed-off-by: Alex --- .github/workflows/dynamic_arch.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index 138a853dd..49139317c 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -2,6 +2,9 @@ name: continuous build on: [push, pull_request] +permissions: + contents: read # to fetch code (actions/checkout) + jobs: build: runs-on: ${{ matrix.os }} From f6f35a4288947091e51dda427537ecfb202ec904 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 29 Sep 2022 08:47:14 +0200 Subject: [PATCH 029/154] fix copyobj declarations to work with DYNAMIC_ARCH --- kernel/mips64/KERNEL.MIPS64_GENERIC | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/mips64/KERNEL.MIPS64_GENERIC b/kernel/mips64/KERNEL.MIPS64_GENERIC index 17f2ef976..33bcbeedd 100644 --- a/kernel/mips64/KERNEL.MIPS64_GENERIC +++ b/kernel/mips64/KERNEL.MIPS64_GENERIC @@ -11,26 +11,26 @@ ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c SGEMMKERNEL = ../generic/gemmkernel_2x2.c SGEMMONCOPY = ../generic/gemm_ncopy_2.c SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = ../generic/gemmkernel_2x2.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = ../generic/zgemmkernel_2x2.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMONCOPYOBJ = 
cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c From d2ce93179f6747380488db2a56102dab6fde18ca Mon Sep 17 00:00:00 2001 From: Elliot Saba Date: Thu, 22 Sep 2022 10:38:36 -0700 Subject: [PATCH 030/154] Add `OPENBLAS_DEFAULT_NUM_THREADS` This allows Julia to set a default number of threads (usually `1`) to be used when no other thread counts are specified [0], to short-circuit the default OpenBLAS thread initialization routine that spins up a different number of threads than Julia would otherwise choose. The reason to add a new environment variable is that we want to be able to configure OpenBLAS to avoid performing its initial memory allocation/thread startup, as that can consume significant amounts of memory, but we still want to be sensitive to legacy codebases that set things like `OMP_NUM_THREADS` or `GOTOBLAS_NUM_THREADS`. Creating a new environment variable that is openblas-specific and is not already publicly used to control the overall number of threads of programs like Julia seems to be the best way forward. [0] https://github.com/JuliaLang/julia/pull/46844 --- driver/others/init.c | 2 ++ driver/others/openblas_env.c | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/driver/others/init.c b/driver/others/init.c index cc3145a62..cd10e8d36 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -823,6 +823,8 @@ void gotoblas_affinity_init(void) { if (numprocs == 0) numprocs = readenv_atoi("OMP_NUM_THREADS"); + if (numprocs == 0) numprocs = readenv_atoi("OPENBLAS_DEFAULT_NUM_THREADS"); + numnodes = 1; if (numprocs == 1) { diff --git a/driver/others/openblas_env.c b/driver/others/openblas_env.c index ef91a08e6..35b2270d4 100644 --- a/driver/others/openblas_env.c +++ b/driver/others/openblas_env.c @@ -67,10 +67,16 @@ void openblas_read_env() { openblas_env_thread_timeout=(unsigned int)ret; ret=0; - if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p); + if (readenv(p,"OPENBLAS_DEFAULT_NUM_THREADS")) ret = atoi(p); if(ret<0) ret=0; openblas_env_openblas_num_threads=ret; + ret=0; + if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p); + if(ret<0) ret=0; + if(ret != 0 || openblas_env_openblas_num_threads == 0) + openblas_env_openblas_num_threads=ret; + ret=0; if (readenv(p,"GOTO_NUM_THREADS")) ret = atoi(p); if(ret<0) ret=0; From 5e78493d956093413142064184c273615e44da0b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Sep 2022 11:55:56 +0200 Subject: [PATCH 031/154] Move Cray case after GNU as Cray builds of gfortran have both names in the version string --- f_check | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/f_check b/f_check index bb13e1640..02e078b8a 100755 --- a/f_check +++ b/f_check @@ -82,10 +82,6 @@ else vendor=FUJITSU openmp='-Kopenmp' ;; - *Cray*) - vendor=CRAY - openmp='-fopenmp' - ;; *GNU*|*GCC*) v="${data#*GCC: *\) }" @@ -117,6 +113,10 @@ else esac fi ;; + *Cray*) + vendor=CRAY + openmp='-fopenmp' + ;; *g95*) vendor=G95 openmp='' From 79d842047ad1abd4f0d2ea9e4794564916db8041 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 30 Sep 2022 11:58:15 +0200 Subject: [PATCH 032/154] Move Cray case 
after GNU as Cray builds of gfortran have both names in the version string --- f_check.pl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/f_check.pl b/f_check.pl index cfc7331c2..f093b9ad5 100644 --- a/f_check.pl +++ b/f_check.pl @@ -76,11 +76,6 @@ if ($compiler eq "") { $vendor = FUJITSU; $openmp = "-Kopenmp"; - } elsif ($data =~ /Cray/) { - - $vendor = CRAY; - $openmp = "-fopenmp"; - } elsif ($data =~ /GNU/ || $data =~ /GCC/ ) { $data =~ s/\(+.*?\)+//g; @@ -106,6 +101,10 @@ if ($compiler eq "") { $openmp = ""; } } + } elsif ($data =~ /Cray/) { + + $vendor = CRAY; + $openmp = "-fopenmp"; } From db50ab4a720b45ce356a6f91b2c452c84e1e5a93 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 1 Oct 2022 15:14:51 +0200 Subject: [PATCH 033/154] Add BUILD_vartype defines --- driver/others/blas_server_win32.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 33b58f134..afa33cccc 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -278,12 +278,15 @@ static DWORD WINAPI blas_thread_server(void *arg){ } else #endif if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ +#ifdef BUILD_DOUBLE sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - +#endif } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { +#ifdef BUILD_SINGLE sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif } else { /* Other types in future */ } @@ -295,11 +298,15 @@ static DWORD WINAPI blas_thread_server(void *arg){ } else #endif if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ +#ifdef BUILD_COMPLEX16 sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { +#ifdef BUILD_COMPLEX sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif } else { /* Other types in future */ } From 57809526c430ef0a07f5c5c39dce20c6d73a1f35 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Oct 2022 09:12:23 +0200 Subject: [PATCH 034/154] Disable the gfortran tree vectorizer for lapack-netlib --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile b/Makefile index 289f0eca5..56af9847e 100644 --- a/Makefile +++ b/Makefile @@ -278,7 +278,11 @@ prof_lapack : lapack_prebuild lapack_prebuild : ifeq ($(NO_LAPACK), $(filter 0,$(NO_LAPACK))) -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc +ifeq ($(F_COMPILER), GFORTRAN) + -@echo "override FFLAGS = $(LAPACK_FFLAGS) -fno-tree-vectorize" >> $(NETLIB_LAPACK_DIR)/make.inc +else -@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc +endif -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc From 32566bfb44067e0c0459e94b53c9457613539eeb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 13 Oct 2022 14:04:25 +0200 Subject: [PATCH 035/154] Disable the gfortran tree vectorizer for netlib LAPACK --- cmake/lapack.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index f8a27f5d4..3b221d420 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -999,6 +999,9 @@ 
endforeach () if (NOT C_LAPACK) set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}") + if (${F_COMPILER} STREQUAL "GFORTRAN") + set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS} -fno-tree-vectorize") + endif() else () set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") endif () From b2523471c9e6a398f0950952c376e136398a1cfe Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 18 Oct 2022 16:16:26 +0200 Subject: [PATCH 036/154] Add libsuffix support --- openblas.pc.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openblas.pc.in b/openblas.pc.in index ff849807c..8ad6e8bee 100644 --- a/openblas.pc.in +++ b/openblas.pc.in @@ -2,6 +2,6 @@ Name: openblas Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: ${version} URL: https://github.com/xianyi/OpenBLAS -Libs: -L${libdir} -lopenblas +Libs: -L${libdir} -lopenblas${libsuffix} Libs.private: ${extralib} Cflags: -I${includedir} From 8bacea125426d6f8a01d604962217c4cd837f699 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 18 Oct 2022 16:18:29 +0200 Subject: [PATCH 037/154] Pass libsuffix to openblas.pc and fix passing of INTERFACE64/USE64BITINT flag --- Makefile.install | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile.install b/Makefile.install index 28727de37..adef0b5f4 100644 --- a/Makefile.install +++ b/Makefile.install @@ -152,8 +152,9 @@ endif #Generating openblas.pc @echo Generating $(LIBSONAMEBASE).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" + @echo 'libsuffix='$(SYMBOLSUFFIX) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" - @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" + @echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" @echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" From 747ade5adf36d2267c4f669238471f4eb793e462 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 18 Oct 2022 17:28:07 +0200 Subject: [PATCH 038/154] fix INTERFACE64/USE64BITINT reporting --- cmake/openblas.pc.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in index 0bd49f996..7e120af86 100644 --- a/cmake/openblas.pc.in +++ b/cmake/openblas.pc.in @@ -2,7 +2,7 @@ libdir=@CMAKE_INSTALL_FULL_LIBDIR@ libsuffix=@SUFFIX64_UNDERSCORE@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ -openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ 
DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ +openblas_config=USE_64BITINT=@INTERFACE64@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: @OPENBLAS_VERSION@ From 5f72415f10fb19bf7ab4283238c08eac537f267a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 18 Oct 2022 20:29:24 +0200 Subject: [PATCH 039/154] Suffix the pkgconfig file itself in INTERFACE64 builds --- Makefile.install | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/Makefile.install b/Makefile.install index adef0b5f4..87b5bc870 100644 --- a/Makefile.install +++ b/Makefile.install @@ -14,6 +14,11 @@ OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig PKG_EXTRALIB := $(EXTRALIB) +ifeq ($(INTERFACE64),1) + SUFFIX64=64 +endif +PKGFILE="$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE)$(SUFFIX64).pc" + ifeq ($(USE_OPENMP), 1) ifeq ($(C_COMPILER), PGI) PKG_EXTRALIB += -lomp @@ -150,14 +155,19 @@ endif endif #Generating openblas.pc - @echo Generating $(LIBSONAMEBASE).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" - @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" - @echo 'libsuffix='$(SYMBOLSUFFIX) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" - @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" - @echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" - @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" - @echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" - @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" +ifeq ($(INTERFACE64),1) + SUFFIX64=64 +endif + PKGFILE="$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE)$(SUFFIX64).pc" + + @echo Generating $(LIBSONAMEBASE)$(SUFFIX64).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" + @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(PKGFILE)" + @echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)" + @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)" + @echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)" + @echo 'version='$(VERSION) >> "$(PKGFILE)" + @echo 'extralib='$(PKG_EXTRALIB) >> "$(PKGFILE)" + @cat openblas.pc.in >> "$(PKGFILE)" #Generating OpenBLASConfig.cmake From 9959a60873fbddc9dea23f4c32cc035147d1f351 Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Thu, 20 Oct 2022 13:28:20 -0400 Subject: [PATCH 040/154] Benchmarks: align malloc'ed buffers. 
Benchmarks should allocate with cacheline (often 64 bytes) alignment to avoid unreliable timings. This technique, storing the offset in the byte before the pointer, doesn't require C11's aligned_alloc for compatibility with older compilers. For example, Glibc's x86_64 malloc returns 16-byte aligned buffers, which is not sufficient for AVX/AVX2 (32-byte preferred) or AVX512 (64-byte). --- benchmark/bench.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/benchmark/bench.h b/benchmark/bench.h index c03d72bef..f23e487aa 100644 --- a/benchmark/bench.h +++ b/benchmark/bench.h @@ -74,6 +74,24 @@ static void *huge_malloc(BLASLONG size){ #endif +/* Benchmarks should allocate with cacheline (often 64 bytes) alignment + to avoid unreliable results. This technique, storing the offset in the + byte before the pointer, doesn't require C11's aligned_alloc for + compatibility with older compilers. */ +static void *aligned_alloc_cacheline(size_t n) +{ + void *p = malloc((size_t)(void *) + n + L1_DATA_LINESIZE - 1); + if (p) { + void **newp = (void **) + (((uintptr_t)p + L1_DATA_LINESIZE) & (uintptr_t)-L1_DATA_LINESIZE); + newp[-1] = p; + p = newp; + } + return p; +} +#define malloc aligned_alloc_cacheline +#define free(p) free((p) ? ((void **)(p))[-1] : (p)) + #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) struct timeval start, stop; #elif defined(__APPLE__) From 9e6b060bf3d74dd9eac7325cb9e5cc262a5584a6 Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Thu, 20 Oct 2022 20:11:09 -0400 Subject: [PATCH 041/154] Fix comment. It stores the pointer, not an offset (that would be an alternative approach). --- benchmark/bench.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/bench.h b/benchmark/bench.h index f23e487aa..1dae4d0fd 100644 --- a/benchmark/bench.h +++ b/benchmark/bench.h @@ -75,9 +75,9 @@ static void *huge_malloc(BLASLONG size){ #endif /* Benchmarks should allocate with cacheline (often 64 bytes) alignment - to avoid unreliable results. This technique, storing the offset in the - byte before the pointer, doesn't require C11's aligned_alloc for - compatibility with older compilers. */ + to avoid unreliable results. This technique, storing the allocated + pointer value just before the aligned memory, doesn't require + C11's aligned_alloc for compatibility with older compilers. */ static void *aligned_alloc_cacheline(size_t n) { void *p = malloc((size_t)(void *) + n + L1_DATA_LINESIZE - 1); From b00d5b974637bd079c1d8cbbf5c406259aa5a804 Mon Sep 17 00:00:00 2001 From: Honglin Zhu Date: Wed, 19 Oct 2022 11:36:26 +0800 Subject: [PATCH 042/154] New sbgemm implementation for Neoverse N2 1. Use UZP instructions but not gather load and scatter store instructions to get lower latency. 2. Padding k to a power of 4. 
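Here "padding" means rounding k up to the next multiple of 4, the block of
k-elements consumed by one BFMMLA instruction. A minimal sketch of the rounding
used by the new kernel and the level3 drivers in this patch, where pad_k denotes
the padded length:

    BLASLONG pad_k = (k + 3) & ~3;   /* smallest multiple of 4 that is >= k */
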
--- common_param.h | 1 + driver/level3/level3.c | 16 +- driver/level3/level3_thread.c | 14 +- kernel/arm64/KERNEL.NEOVERSEN2 | 11 +- .../arm64/sbgemm_kernel_neoversen2_newbf16.c | 467 ++++++++++++++++++ kernel/arm64/sbgemm_ncopy_4_neoversen2.c | 137 +++++ kernel/arm64/sbgemm_tcopy_8_neoversen2.c | 174 +++++++ kernel/setparam-ref.c | 9 +- 8 files changed, 818 insertions(+), 11 deletions(-) create mode 100644 kernel/arm64/sbgemm_kernel_neoversen2_newbf16.c create mode 100644 kernel/arm64/sbgemm_ncopy_4_neoversen2.c create mode 100644 kernel/arm64/sbgemm_tcopy_8_neoversen2.c diff --git a/common_param.h b/common_param.h index 31fba9059..091840343 100644 --- a/common_param.h +++ b/common_param.h @@ -1193,6 +1193,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); #ifdef BUILD_COMPLEX16 int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); #endif + int align_k; // must be 2^n } gotoblas_t; extern gotoblas_t *gotoblas; diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 4a8e193be..d3281345d 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -304,6 +304,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, while (gemm_p * min_l > l2size) gemm_p -= GEMM_UNROLL_M; } + BLASLONG pad_min_l = min_l; + +#if defined(HALF) && defined(DYNAMIC_ARCH) + pad_min_l = (min_l + gotoblas->align_k - 1) & ~(gotoblas->align_k-1); +#endif + +#if defined(HALF) && !defined(DYNAMIC_ARCH) && defined(NEOVERSEN2) + pad_min_l = (min_l + 3) & ~3; +#endif + /* First, we have to move data A to L2 cache */ min_i = m_to - m_from; l1stride = 1; @@ -350,7 +360,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, START_RPCC(); OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, - sb + min_l * (jjs - js) * COMPSIZE * l1stride); + sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride); STOP_RPCC(outercost); @@ -358,10 +368,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) KERNEL_OPERATION(min_i, min_jj, min_l, alpha, - sa, sb + min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); + sa, sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); #else KERNEL_OPERATION(min_i, min_jj, min_l, (void *)&xalpha, - sa, sb + min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); + sa, sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); #endif STOP_RPCC(kernelcost); diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index dfc7107b8..95c8e6d19 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -324,6 +324,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } else { if (min_l > GEMM_Q) min_l = (min_l + 1) / 2; } + + BLASLONG pad_min_l = min_l; + +#if defined(HALF) && defined(DYNAMIC_ARCH) + pad_min_l = (min_l + gotoblas->align_k - 1) & ~(gotoblas->align_k-1); +#endif + +#if defined(HALF) && !defined(DYNAMIC_ARCH) && defined(NEOVERSEN2) + pad_min_l = (min_l + 3) & ~3; +#endif /* Determine step size in m * Note: We are currently on the first step in m @@ -382,13 +392,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Copy part of local region of B into workspace */ START_RPCC(); OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, - buffer[bufferside] + min_l * (jjs - js) * COMPSIZE * l1stride); + buffer[bufferside] + pad_min_l * (jjs - js) * COMPSIZE * l1stride); STOP_RPCC(copy_B); /* 
Apply kernel with local region of A and part of local region of B */ START_RPCC(); KERNEL_OPERATION(min_i, min_jj, min_l, alpha, - sa, buffer[bufferside] + min_l * (jjs - js) * COMPSIZE * l1stride, + sa, buffer[bufferside] + pad_min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); STOP_RPCC(kernel); diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 index 07a94a043..7fe9acd5c 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN2 +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -189,11 +189,12 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) SBGEMM_BETA = sbgemm_beta_neoversen2.c -SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversen2.c -SBGEMMINCOPY = sbgemm_ncopy_neoversen2.c -SBGEMMITCOPY = sbgemm_tcopy_neoversen2.c -SBGEMMONCOPY = sbgemm_ncopy_neoversen2.c -SBGEMMOTCOPY = sbgemm_tcopy_neoversen2.c +# SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversen2.c +SBGEMMKERNEL = sbgemm_kernel_neoversen2_newbf16.c +SBGEMMINCOPY = sbgemm_ncopy_4_neoversen2.c +SBGEMMITCOPY = sbgemm_tcopy_8_neoversen2.c +SBGEMMONCOPY = sbgemm_ncopy_4_neoversen2.c +SBGEMMOTCOPY = sbgemm_tcopy_8_neoversen2.c SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/sbgemm_kernel_neoversen2_newbf16.c b/kernel/arm64/sbgemm_kernel_neoversen2_newbf16.c new file mode 100644 index 000000000..1bf743c7f --- /dev/null +++ b/kernel/arm64/sbgemm_kernel_neoversen2_newbf16.c @@ -0,0 +1,467 @@ +/*************************************************************************** + * Copyright (c) 2022, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * *****************************************************************************/ + +#include + +#include "common.h" + +#define LOAD_C(M, N) mc##M##N = svdup_f32(0); + +#define MATMUL(M, N) mc##M##N = svbfmmla(mc##M##N, ma##M, mb##N); + +#define LOAD_C_8x4 \ + do { \ + LOAD_C(0, 0); \ + LOAD_C(0, 1); \ + LOAD_C(1, 0); \ + LOAD_C(1, 1); \ + LOAD_C(2, 0); \ + LOAD_C(2, 1); \ + LOAD_C(3, 0); \ + LOAD_C(3, 1); \ + } while (0); + +#define STORE_C(PG, PTR, SRC, DST) \ + do { \ + SRC = svld1_f32((PG), (PTR)); \ + DST = svmad_z((PG), svalpha, DST, SRC); \ + svst1_f32((PG), (PTR), DST); \ + } while (0); + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT *A, IFLOAT *B, + FLOAT *C, BLASLONG ldc) { + BLASLONG pad_k = (k + 3) & ~3; + + svbfloat16_t ma0, ma1, ma2, ma3, mb0, mb1; + svfloat32_t mc00, mc01, mc10, mc11, mc20, mc21, mc30, mc31, + vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7, + oc0, oc1, oc2, oc3, oc4, oc5, oc6, oc7; + svfloat32_t svalpha = svdup_f32(alpha); + + svbool_t pg16 = svptrue_b16(); + svbool_t pg16_low = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); + svbool_t pg32 = svptrue_b32(); + svbool_t pg32_low = svdupq_b32(1, 1, 0, 0); + svbool_t pg32_first = svdupq_b32(1, 0, 0, 0); + + bfloat16_t *ptr_a = (bfloat16_t *)A; + bfloat16_t *ptr_b = (bfloat16_t *)B; + FLOAT *ptr_c = C; + + bfloat16_t *ptr_a0, *ptr_a1, *ptr_a2, *ptr_a3; + bfloat16_t *ptr_b0, *ptr_b1; + FLOAT *ptr_c0, *ptr_c1, *ptr_c2, *ptr_c3; + + for (BLASLONG j = 0; j < n / 4; j++) { + ptr_c0 = ptr_c; + ptr_c1 = ptr_c0 + ldc; + ptr_c2 = ptr_c1 + ldc; + ptr_c3 = ptr_c2 + ldc; + ptr_c += 4 * ldc; + ptr_a = (bfloat16_t *)A; + + for (BLASLONG i = 0; i < m / 8; i++) { + ptr_a0 = ptr_a; + ptr_a += 8 * pad_k; + + ptr_b0 = ptr_b; + + LOAD_C_8x4; + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + ma2 = svld1_bf16(pg16, ptr_a0 + 16); + ma3 = svld1_bf16(pg16, ptr_a0 + 24); + + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); + +#if 0 + for (int q = 0; q < 8; q++) { + float tmp = 0; + *((bfloat16_t *)(&tmp) + 1) = ptr_b0[8+q]; + printf("%.1f ", tmp); + } + printf("\n"); +#endif + + MATMUL(0, 0); MATMUL(0, 1); + MATMUL(1, 0); MATMUL(1, 1); + MATMUL(2, 0); MATMUL(2, 1); + MATMUL(3, 0); MATMUL(3, 1); + + ptr_a0 += 32; + ptr_b0 += 16; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp1(mc20, mc30); + vc2 = svuzp2(mc00, mc10); + vc3 = svuzp2(mc20, mc30); + vc4 = svuzp1(mc01, mc11); + vc5 = svuzp1(mc21, mc31); + vc6 = svuzp2(mc01, mc11); + vc7 = svuzp2(mc21, mc31); + + STORE_C(pg32, ptr_c0, oc0, vc0); + STORE_C(pg32, ptr_c0+4, oc1, vc1); + STORE_C(pg32, ptr_c1, oc2, vc2); + STORE_C(pg32, ptr_c1+4, oc3, vc3); + STORE_C(pg32, ptr_c2, oc4, vc4) + STORE_C(pg32, ptr_c2+4, oc5, vc5); + STORE_C(pg32, ptr_c3, oc6, vc6) + STORE_C(pg32, ptr_c3+4, oc7, vc7); + + ptr_c0 += 8; + ptr_c1 += 8; + ptr_c2 += 8; + ptr_c3 += 8; + } + + if (m & 4) { + ptr_a0 = ptr_a; + ptr_a += 4 * pad_k; + ptr_b0 = ptr_b; + + LOAD_C(0, 0); LOAD_C(0, 1); + LOAD_C(1, 0); LOAD_C(1, 1); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); + + MATMUL(0, 0); MATMUL(0, 1); + MATMUL(1, 0); MATMUL(1, 1); + + ptr_a0 += 16; + ptr_b0 += 16; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp2(mc00, mc10); + vc2 = svuzp1(mc01, mc11); + vc3 = svuzp2(mc01, mc11); + + STORE_C(pg32, ptr_c0, oc0, vc0); + STORE_C(pg32, ptr_c1, oc1, vc1); + STORE_C(pg32, ptr_c2, oc2, vc2); + 
STORE_C(pg32, ptr_c3, oc3, vc3); + + ptr_c0 += 4; + ptr_c1 += 4; + ptr_c2 += 4; + ptr_c3 += 4; + } + + if (m & 2) { + ptr_a0 = ptr_a; + ptr_a += 2 * pad_k; + ptr_b0 = ptr_b; + + LOAD_C(0, 0); LOAD_C(0, 1); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); + + MATMUL(0, 0); MATMUL(0, 1); + + ptr_a0 += 8; + ptr_b0 += 16; + } + + vc0 = svuzp1(mc00, mc00); + vc1 = svuzp2(mc00, mc00); + vc2 = svuzp1(mc01, mc01); + vc3 = svuzp2(mc01, mc01); + + STORE_C(pg32_low, ptr_c0, oc0, vc0); + STORE_C(pg32_low, ptr_c1, oc1, vc1); + STORE_C(pg32_low, ptr_c2, oc2, vc2); + STORE_C(pg32_low, ptr_c3, oc3, vc3); + + ptr_c0 += 2; + ptr_c1 += 2; + ptr_c2 += 2; + ptr_c3 += 2; + } + + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + + LOAD_C(0, 0); LOAD_C(0, 1); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16_low, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); + + MATMUL(0, 0); MATMUL(0, 1); + + ptr_a0 += 4; + ptr_b0 += 16; + } + + vc1 = svuzp2(mc00, mc00); + vc3 = svuzp2(mc01, mc01); + + STORE_C(pg32_first, ptr_c0, oc0, mc00); + STORE_C(pg32_first, ptr_c1, oc1, vc1); + STORE_C(pg32_first, ptr_c2, oc2, mc01); + STORE_C(pg32_first, ptr_c3, oc3, vc3); + + } + + ptr_b += 4 * pad_k; + } + + if (n & 2) { + ptr_c0 = ptr_c; + ptr_c1 = ptr_c0 + ldc; + ptr_c += 2 * ldc; + ptr_a = (bfloat16_t *)A; + + for (BLASLONG i = 0; i < m / 8; i++) { + ptr_a0 = ptr_a; + ptr_a += 8 * pad_k; + + ptr_b0 = ptr_b; + + LOAD_C(0, 0); + LOAD_C(1, 0); + LOAD_C(2, 0); + LOAD_C(3, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + ma2 = svld1_bf16(pg16, ptr_a0 + 16); + ma3 = svld1_bf16(pg16, ptr_a0 + 24); + + mb0 = svld1_bf16(pg16, ptr_b0); + + MATMUL(0, 0); + MATMUL(1, 0); + MATMUL(2, 0); + MATMUL(3, 0); + + ptr_a0 += 32; + ptr_b0 += 8; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp1(mc20, mc30); + vc2 = svuzp2(mc00, mc10); + vc3 = svuzp2(mc20, mc30); + + STORE_C(pg32, ptr_c0, oc0, vc0); + STORE_C(pg32, ptr_c0 + 4, oc1, vc1); + STORE_C(pg32, ptr_c1, oc2, vc2); + STORE_C(pg32, ptr_c1 + 4, oc3, vc3); + + ptr_c0 += 8; + ptr_c1 += 8; + } + + if (m & 4) { + ptr_a0 = ptr_a; + ptr_a += 4 * pad_k; + ptr_b0 = ptr_b; + + LOAD_C(0, 0); + LOAD_C(1, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + mb0 = svld1_bf16(pg16, ptr_b0); + MATMUL(0, 0); + MATMUL(1, 0); + ptr_a0 += 16; + ptr_b0 += 8; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp2(mc00, mc10); + + STORE_C(pg32, ptr_c0, oc0, vc0); + STORE_C(pg32, ptr_c1, oc1, vc1); + + ptr_c0 += 4; + ptr_c1 += 4; + } + + if (m & 2) { + ptr_a0 = ptr_a; + ptr_a += 2 * pad_k; + ptr_b0 = ptr_b; + + LOAD_C(0, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + + MATMUL(0, 0); + + ptr_a0 += 8; + ptr_b0 += 8; + } + + vc0 = svuzp1(mc00, mc00); + vc1 = svuzp2(mc00, mc00); + STORE_C(pg32_low, ptr_c0, oc0, vc0); + STORE_C(pg32_low, ptr_c1, oc1, vc1); + + ptr_c0 += 2; + ptr_c1 += 2; + + } + + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + LOAD_C(0, 0); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16_low, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + MATMUL(0, 0); + ptr_a0 += 4; + ptr_b0 += 8; + } + vc1 = svuzp2(mc00, mc00); + + STORE_C(pg32_first, ptr_c0, oc0, mc00); + STORE_C(pg32_first, ptr_c1, oc1, vc1); + } + + ptr_b += 2 * pad_k; + } + + 
if (n & 1) { + ptr_c0 = ptr_c; + ptr_a = (bfloat16_t *)A; + + for (BLASLONG i = 0; i < m / 8; i++) { + ptr_a0 = ptr_a; + ptr_a += 8 * pad_k; + + ptr_b0 = ptr_b; + + LOAD_C(0, 0); + LOAD_C(1, 0); + LOAD_C(2, 0); + LOAD_C(3, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + ma2 = svld1_bf16(pg16, ptr_a0 + 16); + ma3 = svld1_bf16(pg16, ptr_a0 + 24); + + mb0 = svld1_bf16(pg16_low, ptr_b0); + + MATMUL(0, 0); + MATMUL(1, 0); + MATMUL(2, 0); + MATMUL(3, 0); + + ptr_a0 += 32; + ptr_b0 += 4; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp1(mc20, mc30); + + STORE_C(pg32, ptr_c0, oc0, vc0); + STORE_C(pg32, ptr_c0 + 4, oc1, vc1); + + ptr_c0 += 8; + } + + if (m & 4) { + ptr_a0 = ptr_a; + ptr_a += 4 * pad_k; + ptr_b0 = ptr_b; + LOAD_C(0, 0); + LOAD_C(1, 0); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + mb0 = svld1_bf16(pg16_low, ptr_b0); + MATMUL(0, 0); + MATMUL(1, 0); + ptr_a0 += 16; + ptr_b0 += 4; + } + vc0 = svuzp1(mc00, mc10); + STORE_C(pg32, ptr_c0, oc0, vc0); + ptr_c0 += 4; + } + + if (m & 2) { + ptr_a0 = ptr_a; + ptr_a += 2 * pad_k; + ptr_b0 = ptr_b; + + LOAD_C(0, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + mb0 = svld1_bf16(pg16_low, ptr_b0); + + MATMUL(0, 0); + + ptr_a0 += 8; + ptr_b0 += 4; + } + vc0 = svuzp1(mc00, mc00); + STORE_C(pg32_low, ptr_c0, oc0, vc0); + ptr_c0 += 2; + } + + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + LOAD_C(0, 0); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16_low, ptr_a0); + mb0 = svld1_bf16(pg16_low, ptr_b0); + MATMUL(0, 0); + ptr_a0 += 4; + ptr_b0 += 4; + } + STORE_C(pg32_first, ptr_c0, oc0, mc00); + } + } + + return 0; +} diff --git a/kernel/arm64/sbgemm_ncopy_4_neoversen2.c b/kernel/arm64/sbgemm_ncopy_4_neoversen2.c new file mode 100644 index 000000000..0b0e7a427 --- /dev/null +++ b/kernel/arm64/sbgemm_ncopy_4_neoversen2.c @@ -0,0 +1,137 @@ +/*************************************************************************** + * Copyright (c) 2022, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + IFLOAT *a_offset; + IFLOAT *a_offsetx[4]; + IFLOAT *b_offset; + a_offset = a; + b_offset = b; + + svbool_t pg16 = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); + svbfloat16_t v0, v1, v2, v3; + + for (BLASLONG j = 0; j < n / 4; j++) { + a_offsetx[0] = a_offset; + a_offsetx[1] = a_offsetx[0] + lda; + a_offsetx[2] = a_offsetx[1] + lda; + a_offsetx[3] = a_offsetx[2] + lda; + a_offset += 4 * lda; + + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); + v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]); + v2 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[2]); + v3 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[3]); + + svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 8, v2); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 12, v3); + +#if 0 + for (int line = 0; line < 4; line++) { + for (int p = 0; p < 4; p++) { + float tmp = 0; + *((bfloat16 *)(&tmp) + 1) = b_offset[line * 4 + p]; + printf("%f ", tmp); + } + printf("\n"); + } +#endif + + b_offset += 16; + a_offsetx[0] += 4; + a_offsetx[1] += 4; + a_offsetx[2] += 4; + a_offsetx[3] += 4; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG col = 0; col < 4; col++) { + b_offset[4 * col] = a_offsetx[col][0]; + b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1]; + b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2]; + b_offset[4 * col + 3] = rest <= 3 ? 0 : a_offsetx[col][3]; + } + b_offset += 16; + } + } + + if (n & 2) { + a_offsetx[0] = a_offset; + a_offsetx[1] = a_offsetx[0] + lda; + a_offset += 2 * lda; + + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); + v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]); + svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); + svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1); + + b_offset += 8; + a_offsetx[0] += 4; + a_offsetx[1] += 4; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG col = 0; col < 2; col++) { + b_offset[4 * col] = a_offsetx[col][0]; + b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1]; + b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2]; + b_offset[4 * col + 3] = rest <= 3 ? 0 : a_offsetx[col][3]; + } + b_offset += 8; + } + } + + if (n & 1) { + a_offsetx[0] = a_offset; + for (BLASLONG i = 0; i < m / 4; i++) { + v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); + svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); + b_offset += 4; + a_offsetx[0] += 4; + } + if (m & 3) { + BLASLONG rest = m & 3; + b_offset[0] = a_offsetx[0][0]; + b_offset[1] = rest == 1 ? 0 : a_offsetx[0][1]; + b_offset[2] = rest <= 2 ? 0 : a_offsetx[0][2]; + b_offset[3] = rest <= 3 ? 
0 : a_offsetx[0][3]; + } + } + + return 0; +} diff --git a/kernel/arm64/sbgemm_tcopy_8_neoversen2.c b/kernel/arm64/sbgemm_tcopy_8_neoversen2.c new file mode 100644 index 000000000..6c37e4bcf --- /dev/null +++ b/kernel/arm64/sbgemm_tcopy_8_neoversen2.c @@ -0,0 +1,174 @@ +/*************************************************************************** + * Copyright (c) 2022, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + IFLOAT *a_offset, *a_offset0, *a_offset1, *a_offset2, *a_offset3; + IFLOAT *b_offset; + a_offset = a; + b_offset = b; + + for (BLASLONG j = 0; j < n / 8; j++) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset += 8; + + for (BLASLONG i = 0; i < m / 4; i++) { + for (BLASLONG line = 0; line < 8; line++) { +#if 0 + float fv0 = 0, fv1 = 0, fv2 = 0, fv3 = 0; + *((bfloat16 *)(&fv0) + 1) = a_offset0[line]; + *((bfloat16 *)(&fv1) + 1) = a_offset1[line]; + *((bfloat16 *)(&fv2) + 1) = a_offset2[line]; + *((bfloat16 *)(&fv3) + 1) = a_offset3[line]; + printf("%f %f %f %f\n", fv0, fv1, fv2, fv3); +#endif + + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = a_offset1[line]; + b_offset[line * 4 + 2] = a_offset2[line]; + b_offset[line * 4 + 3] = a_offset3[line]; + } + + b_offset += 32; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG line = 0; line < 8; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; + b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; + b_offset[line * 4 + 3] = rest <= 3 ? 
0 : a_offset3[line]; + } + b_offset += 32; + } + } + + if (n & 4) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset += 4; + + for (BLASLONG i = 0; i < m / 4; i++) { + for (BLASLONG line = 0; line < 4; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = a_offset1[line]; + b_offset[line * 4 + 2] = a_offset2[line]; + b_offset[line * 4 + 3] = a_offset3[line]; + } + + b_offset += 16; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG line = 0; line < 4; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; + b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; + b_offset[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line]; + } + b_offset += 16; + } + } + + if (n & 2) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset += 2; + + for (BLASLONG i = 0; i < m / 4; i++) { + for (BLASLONG line = 0; line < 2; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = a_offset1[line]; + b_offset[line * 4 + 2] = a_offset2[line]; + b_offset[line * 4 + 3] = a_offset3[line]; + } + b_offset += 8; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + for (BLASLONG line = 0; line < 2; line++) { + b_offset[line * 4] = a_offset0[line]; + b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line]; + b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line]; + b_offset[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line]; + } + b_offset += 8; + } + } + + if (n & 1) { + a_offset0 = a_offset; + a_offset1 = a_offset0 + lda; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + + for (BLASLONG i = 0; i < m / 4; i++) { + b_offset[0] = *a_offset0; + b_offset[1] = *a_offset1; + b_offset[2] = *a_offset2; + b_offset[3] = *a_offset3; + b_offset += 4; + a_offset0 += 4 * lda; + a_offset1 += 4 * lda; + a_offset2 += 4 * lda; + a_offset3 += 4 * lda; + } + + if (m & 3) { + BLASLONG rest = m & 3; + b_offset[0] = *a_offset0; + b_offset[1] = rest == 1 ? 0 : *a_offset1; + b_offset[2] = rest <= 2 ? 0 : *a_offset2; + b_offset[3] = rest <= 3 ? 
0 : *a_offset3; + } + } + return 0; +} diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 8bcd31ef2..010c39bd4 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -866,8 +866,9 @@ gotoblas_t TABLE_NAME = { cgeadd_kTS, #endif #if BUILD_COMPLEX16==1 - zgeadd_kTS + zgeadd_kTS, #endif + 0, // padding_k }; #if (ARCH_ARM64) @@ -972,6 +973,12 @@ static void init_parameter(void) { TABLE_NAME.xgemm3m_r = TABLE_NAME.qgemm_r; #endif #endif + +#if defined(NEOVERSEN2) && BUILD_BFLOAT16 == 1 + TABLE_NAME.align_k = 4; +#else + TABLE_NAME.align_k = 1; +#endif } #else // (ARCH_ARM64) From 843e9fd0b9fb428a9d715cdc6506e57395311a27 Mon Sep 17 00:00:00 2001 From: Honglin Zhu Date: Wed, 26 Oct 2022 17:06:06 +0800 Subject: [PATCH 043/154] Fix typo error --- kernel/setparam-ref.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 010c39bd4..effcf8965 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -868,7 +868,7 @@ gotoblas_t TABLE_NAME = { #if BUILD_COMPLEX16==1 zgeadd_kTS, #endif - 0, // padding_k + 1, // align_k }; #if (ARCH_ARM64) From e7fd8d21a6f88b098a25ab76d3360efa1d38f830 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 26 Oct 2022 15:33:58 +0200 Subject: [PATCH 044/154] Add GEMMT based on looped GEMV --- interface/CMakeLists.txt | 2 +- interface/Makefile | 57 +++- interface/gemmt.c | 589 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 637 insertions(+), 11 deletions(-) create mode 100644 interface/gemmt.c diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 0b2998237..654684b71 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -53,7 +53,7 @@ set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES # these do not have separate 'z' sources set(BLAS3_SOURCES gemm.c symm.c - trsm.c syrk.c syr2k.c + trsm.c syrk.c syr2k.c gemmt.c ) set(BLAS3_MANGLED_SOURCES diff --git a/interface/Makefile b/interface/Makefile index abdac96e1..a1f4f66da 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -44,12 +44,12 @@ SBLAS3OBJS = \ sgemm.$(SUFFIX) ssymm.$(SUFFIX) strmm.$(SUFFIX) \ strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX) \ somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ - sgeadd.$(SUFFIX) + sgeadd.$(SUFFIX) sgemmt.$(SUFFIX) ifeq ($(BUILD_BFLOAT16),1) SBBLAS1OBJS = sbdot.$(SUFFIX) SBBLAS2OBJS = sbgemv.$(SUFFIX) -SBBLAS3OBJS = sbgemm.$(SUFFIX) +SBBLAS3OBJS = sbgemm.$(SUFFIX) sbgemmt.$(SUFFIX) SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) endif @@ -76,7 +76,7 @@ DBLAS3OBJS = \ dgemm.$(SUFFIX) dsymm.$(SUFFIX) dtrmm.$(SUFFIX) \ dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX) \ domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX)\ - dgeadd.$(SUFFIX) + dgeadd.$(SUFFIX) dgemmt.$(SUFFIX) CBLAS1OBJS = \ caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ @@ -105,7 +105,7 @@ CBLAS3OBJS = \ ctrsm.$(SUFFIX) csyrk.$(SUFFIX) csyr2k.$(SUFFIX) \ chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX) \ comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX)\ - cgeadd.$(SUFFIX) + cgeadd.$(SUFFIX) cgemmt.$(SUFFIX) ZBLAS1OBJS = \ zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ @@ -134,7 +134,7 @@ ZBLAS3OBJS = \ ztrsm.$(SUFFIX) zsyrk.$(SUFFIX) zsyr2k.$(SUFFIX) \ zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX) \ zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX)\ - zgeadd.$(SUFFIX) + zgeadd.$(SUFFIX) zgemmt.$(SUFFIX) ifeq ($(SUPPORT_GEMM3M), 1) @@ -281,12 +281,12 @@ CSBLAS2OBJS = \ CSBLAS3OBJS = \ cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \ 
cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ - cblas_sgeadd.$(SUFFIX) + cblas_sgeadd.$(SUFFIX) cblas_sgemmt.$(SUFFIX) ifeq ($(BUILD_BFLOAT16),1) CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX) CSBBLAS2OBJS = cblas_sbgemv.$(SUFFIX) -CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) +CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) cblas_sbgemmt.$(SUFFIX) CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) endif @@ -306,7 +306,7 @@ CDBLAS2OBJS = \ CDBLAS3OBJS += \ cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \ cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) \ - cblas_dgeadd.$(SUFFIX) + cblas_dgeadd.$(SUFFIX) cblas_dgemmt.$(SUFFIX) CCBLAS1OBJS = \ cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ @@ -331,7 +331,7 @@ CCBLAS3OBJS = \ cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \ cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\ - cblas_cgeadd.$(SUFFIX) + cblas_cgeadd.$(SUFFIX) cblas_cgemmt.$(SUFFIX) CXERBLAOBJ = \ cblas_xerbla.$(SUFFIX) @@ -362,7 +362,7 @@ CZBLAS3OBJS = \ cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \ cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\ cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) \ - cblas_zgeadd.$(SUFFIX) + cblas_zgeadd.$(SUFFIX) cblas_zgemmt.$(SUFFIX) ifeq ($(SUPPORT_GEMM3M), 1) @@ -1300,6 +1300,8 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c ifeq ($(BUILD_BFLOAT16),1) sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) +sbgemmt.$(SUFFIX) sbgemm.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) endif sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h @@ -1320,6 +1322,24 @@ zgemm.$(SUFFIX) zgemm.$(PSUFFIX) : gemm.c ../param.h xgemm.$(SUFFIX) xgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) +sgemmt.$(SUFFIX) sgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgemmt.$(SUFFIX) dgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +qgemmt.$(SUFFIX) qgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgemmt.$(SUFFIX) cgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgemmt.$(SUFFIX) zgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +xgemmt.$(SUFFIX) xgemm.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + ssymm.$(SUFFIX) ssymm.$(PSUFFIX) : symm.c $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1907,6 +1927,23 @@ cblas_cgemm.$(SUFFIX) cblas_cgemm.$(PSUFFIX) : gemm.c ../param.h cblas_zgemm.$(SUFFIX) cblas_zgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +ifeq ($(BUILD_BFLOAT16),1) +cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +endif + +cblas_dgemmt.$(SUFFIX) cblas_dgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_cgemmt.$(SUFFIX) cblas_cgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zgemmt.$(SUFFIX) cblas_zgemmt.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + cblas_ssymm.$(SUFFIX) cblas_ssymm.$(PSUFFIX) : symm.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) diff --git 
a/interface/gemmt.c b/interface/gemmt.c new file mode 100644 index 000000000..3eed1dfe4 --- /dev/null +++ b/interface/gemmt.c @@ -0,0 +1,589 @@ +/*********************************************************************/ +/* Copyright 2022, The OpenBLAS Project. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef COMPLEX +#define SMP_THRESHOLD_MIN 65536.0 +#ifdef XDOUBLE +#define ERROR_NAME "QGEMT " +#elif defined(DOUBLE) +#define ERROR_NAME "DGEMT " +#elif defined(BFLOAT16) +#define ERROR_NAME "SBGEMT " +#else +#define ERROR_NAME "SGEMT " +#endif +#else +#define SMP_THRESHOLD_MIN 8192.0 +#ifdef XDOUBLE +#define ERROR_NAME "XGEMT " +#elif defined(DOUBLE) +#define ERROR_NAME "ZGEMT " +#else +#define ERROR_NAME "CGEMT " +#endif +#endif + +#ifndef GEMM_MULTITHREAD_THRESHOLD +#define GEMM_MULTITHREAD_THRESHOLD 4 +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANSA, char *TRANSB, + blasint * M, blasint * N, blasint * K, + FLOAT * Alpha, + IFLOAT * a, blasint * ldA, + IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC) +{ + + blasint m, n, k; + blasint lda, ldb, ldc; + int transa, transb, uplo; + blasint info; + + char transA, transB, Uplo; + IFLOAT *buffer; + IFLOAT *aa, *bb; + FLOAT *cc; +#if defined(COMPLEX) + FLOAT alpha_r, alpha_i, beta_r, beta_i; +#else + FLOAT alpha, beta; +#endif + + PRINT_DEBUG_NAME; + + m = *M; + n = *N; + k = *K; + +#if defined(COMPLEX) + FLOAT *alpha = Alpha; + alpha_r = *(Alpha + 0); + alpha_i = *(Alpha + 1); + + beta_r = *(Beta + 0); + beta_i = *(Beta + 1); +#else + alpha = *Alpha; + beta = *Beta; +#endif + + lda = *ldA; + ldb = *ldB; + ldc = *ldC; + + transA = *TRANSA; + transB = *TRANSB; + Uplo = *UPLO; + TOUPPER(transA); + TOUPPER(transB); + TOUPPER(Uplo); + + transa = -1; + transb = -1; + uplo = -1; + + if (transA == 'N') + transa = 0; + if (transA == 'T') + transa = 1; +#ifndef COMPLEX + if (transA == 'R') + transa = 0; + if (transA == 'C') + transa = 1; +#else 
+ if (transA == 'R') + transa = 2; + if (transA == 'C') + transa = 3; +#endif + + if (transB == 'N') + transb = 0; + if (transB == 'T') + transb = 1; +#ifndef COMPLEX + if (transB == 'R') + transb = 0; + if (transB == 'C') + transb = 1; +#else + if (transB == 'R') + transb = 2; + if (transB == 'C') + transb = 3; +#endif + + if (Uplo == 'U') + uplo = 0; + if (Uplo == 'L') + uplo = 1; + + info = 0; + + if (uplo < 0) + info = 14; + if (ldc < m) + info = 13; + if (k < 0) + info = 5; + if (n < 0) + info = 4; + if (m < 0) + info = 3; + if (transb < 0) + info = 2; + if (transa < 0) + info = 1; + + if (info) { + BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, + blasint N, blasint k, +#ifndef COMPLEX + FLOAT alpha, + IFLOAT * A, blasint LDA, + IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc) +{ +#else + void *valpha, + void *va, blasint LDA, + void *vb, blasint LDB, void *vbeta, void *vc, blasint ldc) +{ + FLOAT *alpha = (FLOAT *) valpha; + FLOAT *beta = (FLOAT *) vbeta; + FLOAT *A = (FLOAT *) va; + FLOAT *B = (FLOAT *) vb; + FLOAT *c = (FLOAT *) vc; +#endif + FLOAT *aa, *bb, *cc; + + int transa, transb, uplo; + blasint info; + blasint m, n, lda, ldb; + FLOAT *a, *b; + XFLOAT *buffer; + + PRINT_DEBUG_CNAME; + + transa = -1; + transb = -1; + info = 0; + + if (order == CblasColMajor) { + + if (TransA == CblasNoTrans) + transa = 0; + if (TransA == CblasTrans) + transa = 1; +#ifndef COMPLEX + if (TransA == CblasConjNoTrans) + transa = 0; + if (TransA == CblasConjTrans) + transa = 1; +#else + if (TransA == CblasConjNoTrans) + transa = 2; + if (TransA == CblasConjTrans) + transa = 3; +#endif + if (TransB == CblasNoTrans) + transb = 0; + if (TransB == CblasTrans) + transb = 1; +#ifndef COMPLEX + if (TransB == CblasConjNoTrans) + transb = 0; + if (TransB == CblasConjTrans) + transb = 1; +#else + if (TransB == CblasConjNoTrans) + transb = 2; + if (TransB == CblasConjTrans) + transb = 3; +#endif + + m = M; + n = N; + + a = (void *)A; + b = (void *)B; + lda = LDA; + ldb = LDB; + + info = -1; + + if (ldc < m) + info = 13; + if (k < 0) + info = 5; + if (n < 0) + info = 4; + if (m < 0) + info = 3; + if (transb < 0) + info = 2; + if (transa < 0) + info = 1; + } + + if (order == CblasRowMajor) { + m = N; + n = M; + + a = (void *)B; + b = (void *)A; + + lda = LDB; + ldb = LDA; + + if (TransB == CblasNoTrans) + transa = 0; + if (TransB == CblasTrans) + transa = 1; +#ifndef COMPLEX + if (TransB == CblasConjNoTrans) + transa = 0; + if (TransB == CblasConjTrans) + transa = 1; +#else + if (TransB == CblasConjNoTrans) + transa = 2; + if (TransB == CblasConjTrans) + transa = 3; +#endif + if (TransA == CblasNoTrans) + transb = 0; + if (TransA == CblasTrans) + transb = 1; +#ifndef COMPLEX + if (TransA == CblasConjNoTrans) + transb = 0; + if (TransA == CblasConjTrans) + transb = 1; +#else + if (TransA == CblasConjNoTrans) + transb = 2; + if (TransA == CblasConjTrans) + transb = 3; +#endif + + info = -1; + + if (ldc < m) + info = 13; + if (k < 0) + info = 5; + if (n < 0) + info = 4; + if (m < 0) + info = 3; + if (transb < 0) + info = 2; + if (transa < 0) + info = 1; + + } + + uplo = -1; + if (Uplo == CblasUpper) + uplo = 0; + if (Uplo == CblasLower) + uplo = 1; + if (uplo < 0) + info = 14; + + if (info >= 0) { + BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } +#if defined(COMPLEX) + FLOAT alpha_r = *(alpha + 0); + FLOAT alpha_i = 
*(alpha + 1); + + FLOAT beta_r = *(beta + 0); + FLOAT beta_i = *(beta + 1); +#endif + +#endif + int buffer_size; + blasint l; + blasint i, j; + +#ifdef SMP + int nthreads; +#endif + +#if defined(COMPLEX) + +#ifdef SMP + static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT *, FLOAT *, + BLASLONG, FLOAT *, BLASLONG, FLOAT *, + BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xgemv_thread_n, xgemv_thread_t, xgemv_thread_r, xgemv_thread_c, + xgemv_thread_o, xgemv_thread_u, xgemv_thread_s, + xgemv_thread_d, +#elif defined DOUBLE + zgemv_thread_n, zgemv_thread_t, zgemv_thread_r, zgemv_thread_c, + zgemv_thread_o, zgemv_thread_u, zgemv_thread_s, + zgemv_thread_d, +#else + cgemv_thread_n, cgemv_thread_t, cgemv_thread_r, cgemv_thread_c, + cgemv_thread_o, cgemv_thread_u, cgemv_thread_s, + cgemv_thread_d, +#endif + }; +#endif + + int (*gemv[]) (BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, + BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, + FLOAT *) = { + GEMV_N, GEMV_T, GEMV_R, GEMV_C, GEMV_O, GEMV_U, GEMV_S, GEMV_D,}; + +#else + +#ifdef SMP + static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, FLOAT *, + BLASLONG, FLOAT *, BLASLONG, FLOAT *, + BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + qgemv_thread_n, qgemv_thread_t, +#elif defined DOUBLE + dgemv_thread_n, dgemv_thread_t, +#else + sgemv_thread_n, sgemv_thread_t, +#endif + }; +#endif + int (*gemv[]) (BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, + FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { + GEMV_N, GEMV_T,}; + +#endif + + if ((m == 0) || (n == 0)) + return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + const blasint incb = (transb == 0) ? 1 : ldb; + + if (uplo == 1) { + for (i = 0; i < n; i++) { + j = n - i; + + l = j; +#if defined(COMPLEX) + aa = a + i * 2; + bb = b + i * ldb * 2; + if (transa) { + l = k; + aa = a + lda * i * 2; + bb = b + i * 2; + } + cc = c + i * 2 * ldc + i * 2; +#else + aa = a + i; + bb = b + i * ldb; + if (transa) { + l = k; + aa = a + lda * i; + bb = b + i; + } + cc = c + i * ldc + i; +#endif + +#if defined(COMPLEX) + if (beta_r != ONE || beta_i != ZERO) + SCAL_K(l, 0, 0, beta_r, beta_i, cc, 1, NULL, 0, + NULL, 0); + + if (alpha_r == ZERO && alpha_i == ZERO) + return; +#else + if (beta != ONE) + SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); + + if (alpha == ZERO) + continue; +#endif + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + buffer_size = j + k + 128 / sizeof(FLOAT); +#ifdef WINDOWS_ABI + buffer_size += 160 / sizeof(FLOAT); +#endif + // for alignment + buffer_size = (buffer_size + 3) & ~3; + STACK_ALLOC(buffer_size, FLOAT, buffer); + +#ifdef SMP + + if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) + nthreads = 1; + else + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + +#if defined(COMPLEX) + (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, + aa, lda, bb, incb, cc, 1, + buffer); +#else + (gemv[(int)transa]) (j, k, 0, alpha, aa, lda, + bb, incb, cc, 1, buffer); +#endif +#ifdef SMP + } else { + + (gemv_thread[(int)transa]) (j, k, alpha, aa, + lda, bb, incb, cc, + 1, buffer, + nthreads); + + } +#endif + + STACK_FREE(buffer); + } + } else { + + for (i = 0; i < n; i++) { + j = i + 1; + + l = j; +#if defined COMPLEX + bb = b + i * ldb * 2; + if (transa) { + l = k; + bb = b + i * 2; + } + cc = c + i * 2 * ldc; +#else + bb = b + i * ldb; + if (transa) { + l = k; + bb = b + i; + } + cc = c + i * ldc; +#endif + +#if defined(COMPLEX) + if (beta_r != ONE || beta_i != ZERO) + SCAL_K(l, 0, 0, beta_r, beta_i, cc, 1, NULL, 0, + NULL, 0); + + if (alpha_r == ZERO && alpha_i == 
ZERO) + return; +#else + if (beta != ONE) + SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); + + if (alpha == ZERO) + continue; +#endif + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + buffer_size = j + k + 128 / sizeof(FLOAT); +#ifdef WINDOWS_ABI + buffer_size += 160 / sizeof(FLOAT); +#endif + // for alignment + buffer_size = (buffer_size + 3) & ~3; + STACK_ALLOC(buffer_size, FLOAT, buffer); + +#ifdef SMP + + if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) + nthreads = 1; + else + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + +#if defined(COMPLEX) + (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, + a, lda, bb, incb, cc, 1, + buffer); +#else + (gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb, + incb, cc, 1, buffer); +#endif + +#ifdef SMP + } else { + + (gemv_thread[(int)transa]) (j, k, alpha, a, lda, + bb, incb, cc, 1, + buffer, nthreads); + + } +#endif + + STACK_FREE(buffer); + } + } + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, + args.m * args.k + args.k * args.n + + args.m * args.n, 2 * args.m * args.n * args.k); + + IDEBUG_END; + + return; +} From 4989e039a5b37de140b41df9a042720599336e29 Mon Sep 17 00:00:00 2001 From: Honglin Zhu Date: Thu, 27 Oct 2022 14:10:26 +0800 Subject: [PATCH 045/154] Define SBGEMM_ALIGN_K for DYNAMIC_ARCH build --- common_param.h | 2 +- driver/level3/level3.c | 11 +++++------ driver/level3/level3_thread.c | 10 +++++----- kernel/setparam-ref.c | 8 ++------ param.h | 5 +++++ 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/common_param.h b/common_param.h index 091840343..e14ef2782 100644 --- a/common_param.h +++ b/common_param.h @@ -50,6 +50,7 @@ typedef struct { #ifdef BUILD_BFLOAT16 int sbgemm_p, sbgemm_q, sbgemm_r; int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn; + int sbgemm_align_k; void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); @@ -1193,7 +1194,6 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); #ifdef BUILD_COMPLEX16 int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); #endif - int align_k; // must be 2^n } gotoblas_t; extern gotoblas_t *gotoblas; diff --git a/driver/level3/level3.c b/driver/level3/level3.c index d3281345d..b7328876b 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -305,13 +305,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, } BLASLONG pad_min_l = min_l; - -#if defined(HALF) && defined(DYNAMIC_ARCH) - pad_min_l = (min_l + gotoblas->align_k - 1) & ~(gotoblas->align_k-1); +#if defined(HALF) +#if defined(DYNAMIC_ARCH) + pad_min_l = (min_l + gotoblas->sbgemm_align_k - 1) & ~(gotoblas->sbgemm_align_k-1); +#else + pad_min_l = (min_l + SBGEMM_ALIGN_K - 1) & ~(SBGEMM_ALIGN_K - 1);; #endif - -#if defined(HALF) && !defined(DYNAMIC_ARCH) && defined(NEOVERSEN2) - pad_min_l = (min_l + 3) & ~3; #endif /* First, we have to move data A to L2 cache */ diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 95c8e6d19..02b60b50d 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -327,12 +327,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, BLASLONG pad_min_l = min_l; -#if defined(HALF) && defined(DYNAMIC_ARCH) - pad_min_l = (min_l + gotoblas->align_k - 1) & ~(gotoblas->align_k-1); +#if defined(HALF) +#if defined(DYNAMIC_ARCH) + pad_min_l = (min_l + gotoblas->sbgemm_align_k - 1) & ~(gotoblas->sbgemm_align_k-1); +#else + 
pad_min_l = (min_l + SBGEMM_ALIGN_K - 1) & ~(SBGEMM_ALIGN_K - 1);; #endif - -#if defined(HALF) && !defined(DYNAMIC_ARCH) && defined(NEOVERSEN2) - pad_min_l = (min_l + 3) & ~3; #endif /* Determine step size in m diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index effcf8965..16d19af1b 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -62,6 +62,8 @@ gotoblas_t TABLE_NAME = { MAX(SBGEMM_DEFAULT_UNROLL_M, SBGEMM_DEFAULT_UNROLL_N), #endif + SBGEMM_ALIGN_K, + sbstobf16_kTS, sbdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS, samax_kTS, samin_kTS, smax_kTS, smin_kTS, @@ -973,12 +975,6 @@ static void init_parameter(void) { TABLE_NAME.xgemm3m_r = TABLE_NAME.qgemm_r; #endif #endif - -#if defined(NEOVERSEN2) && BUILD_BFLOAT16 == 1 - TABLE_NAME.align_k = 4; -#else - TABLE_NAME.align_k = 1; -#endif } #else // (ARCH_ARM64) diff --git a/param.h b/param.h index b9b9a55e8..514b13a3a 100644 --- a/param.h +++ b/param.h @@ -79,6 +79,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SBGEMM_DEFAULT_P 256 #define SBGEMM_DEFAULT_R 256 #define SBGEMM_DEFAULT_Q 256 +#define SBGEMM_ALIGN_K 1 // must be 2^x + #ifdef OPTERON #define SNUMOPT 4 @@ -3394,6 +3396,9 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #elif defined(NEOVERSEN2) +#undef SBGEMM_ALIGN_K +#define SBGEMM_ALIGN_K 4 + #undef SBGEMM_DEFAULT_UNROLL_M #undef SBGEMM_DEFAULT_UNROLL_N #define SBGEMM_DEFAULT_UNROLL_M 8 From e7e3aa29482281edba46a27fcd452d7ed630f46a Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Thu, 27 Oct 2022 17:20:44 -0400 Subject: [PATCH 046/154] x86_64: prevent GCC and Clang from generating FMAs in cscal/zscal. If e.g. -march=haswell is set in CFLAGS, GCC generates FMAs by default, which is inconsistent with the microkernels, none of which use FMAs. These inconsistencies cause a few failures in the LAPACK testcases, where eigenvalue results with/without eigenvectors are compared. Moreover using FMAs for multiplication of complex numbers can give surprising results, see 22aa81f for more information. This uses the same syntax as used in 22aa81f for zarch (s390x). --- kernel/x86_64/cscal.c | 13 +++++++++++++ kernel/x86_64/zscal.c | 13 +++++++++++++ 2 files changed, 26 insertions(+) diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index dc3f688c6..6ae66d973 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -25,6 +25,19 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +/* + * Avoid contraction of floating point operations, specifically fused + * multiply-add, because they can cause unexpected results in complex + * multiplication. + */ +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC optimize ("fp-contract=off") +#endif + +#if defined(__clang__) +#pragma clang fp contract(off) +#endif + #include "common.h" diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 3744c98bb..dfdb4230b 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -25,6 +25,19 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ +/* + * Avoid contraction of floating point operations, specifically fused + * multiply-add, because they can cause unexpected results in complex + * multiplication. + */ +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC optimize ("fp-contract=off") +#endif + +#if defined(__clang__) +#pragma clang fp contract(off) +#endif + #include "common.h" From 79066b6bf3c460caedd4ebbef4d17541fc0369bc Mon Sep 17 00:00:00 2001 From: Honglin Zhu Date: Fri, 28 Oct 2022 17:09:39 +0800 Subject: [PATCH 047/154] Change file name to match the norm and delete useless code. --- kernel/arm64/KERNEL.NEOVERSEN2 | 11 +- kernel/arm64/sbgemm_kernel_8x4_neoversen2.c | 10 +- .../arm64/sbgemm_kernel_8x4_neoversen2_impl.c | 1004 +++++++---------- .../arm64/sbgemm_kernel_neoversen2_newbf16.c | 467 -------- kernel/arm64/sbgemm_ncopy_4_neoversen2.c | 11 - kernel/arm64/sbgemm_ncopy_neoversen2.c | 101 -- kernel/arm64/sbgemm_tcopy_8_neoversen2.c | 9 - kernel/arm64/sbgemm_tcopy_neoversen2.c | 109 -- 8 files changed, 415 insertions(+), 1307 deletions(-) delete mode 100644 kernel/arm64/sbgemm_kernel_neoversen2_newbf16.c delete mode 100644 kernel/arm64/sbgemm_ncopy_neoversen2.c delete mode 100644 kernel/arm64/sbgemm_tcopy_neoversen2.c diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 index 7fe9acd5c..ae386d6e1 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN2 +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -189,12 +189,11 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) SBGEMM_BETA = sbgemm_beta_neoversen2.c -# SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversen2.c -SBGEMMKERNEL = sbgemm_kernel_neoversen2_newbf16.c -SBGEMMINCOPY = sbgemm_ncopy_4_neoversen2.c -SBGEMMITCOPY = sbgemm_tcopy_8_neoversen2.c -SBGEMMONCOPY = sbgemm_ncopy_4_neoversen2.c -SBGEMMOTCOPY = sbgemm_tcopy_8_neoversen2.c +SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversen2.c +SBGEMMINCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversen2.c +SBGEMMITCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversen2.c +SBGEMMONCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversen2.c +SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversen2.c SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/sbgemm_kernel_8x4_neoversen2.c b/kernel/arm64/sbgemm_kernel_8x4_neoversen2.c index 66e7dd38a..4c1385fbe 100644 --- a/kernel/arm64/sbgemm_kernel_8x4_neoversen2.c +++ b/kernel/arm64/sbgemm_kernel_8x4_neoversen2.c @@ -37,9 +37,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc) { - if (alpha == 1.0f) - return sbgemm_kernel_neoversen2_alpha_one(m, n, k, alpha, A, B, C, ldc); - else - return sbgemm_kernel_neoversen2_alpha(m, n, k, alpha, A, B, C, ldc); - return 0; + if (alpha == 1.0f) + return sbgemm_kernel_neoversen2_alpha_one(m, n, k, alpha, A, B, C, ldc); + else + return sbgemm_kernel_neoversen2_alpha(m, n, k, alpha, A, B, C, ldc); + return 0; } diff --git a/kernel/arm64/sbgemm_kernel_8x4_neoversen2_impl.c b/kernel/arm64/sbgemm_kernel_8x4_neoversen2_impl.c index 7d53b1aa0..26ea7ee61 100644 --- a/kernel/arm64/sbgemm_kernel_8x4_neoversen2_impl.c +++ b/kernel/arm64/sbgemm_kernel_8x4_neoversen2_impl.c @@ -30,636 +30,442 @@ #include "common.h" +#define INIT_C(M, N) mc##M##N = svdup_f32(0); + +#define MATMUL(M, N) mc##M##N = 
svbfmmla(mc##M##N, ma##M, mb##N); + +#define INIT_C_8x4 \ + do { \ + INIT_C(0, 0); \ + INIT_C(0, 1); \ + INIT_C(1, 0); \ + INIT_C(1, 1); \ + INIT_C(2, 0); \ + INIT_C(2, 1); \ + INIT_C(3, 0); \ + INIT_C(3, 1); \ + } while (0); + #ifdef ALPHA_ONE -#define LOAD_C(M, N) \ - mc##M##N = svld1_gather_index(pg32, ptr_c0##N + 2 * M , off_vc); +#define UPDATE_C(PG, PTR, DST, SRC) \ + do { \ + DST = svld1_f32((PG), (PTR)); \ + DST = svadd_z((PG), SRC, DST); \ + svst1_f32((PG), (PTR), DST); \ + } while (0); +#else +#define UPDATE_C(PG, PTR, DST, SRC) \ + do { \ + DST = svld1_f32((PG), (PTR)); \ + DST = svmad_z((PG), svalpha, SRC, DST); \ + svst1_f32((PG), (PTR), DST); \ + } while (0); +#endif -#define LOAD_C_LOW(M, N) \ - mc##M##N = svld1_gather_index(pg32_low, ptr_c0##N + 2 * M, off_vc); +#ifdef ALPHA_ONE +int sbgemm_kernel_neoversen2_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +#else +int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG pad_k = (k + 3) & ~3; + + svbfloat16_t ma0, ma1, ma2, ma3, mb0, mb1; + svfloat32_t mc00, mc01, mc10, mc11, mc20, mc21, mc30, mc31, + vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7, + oc0, oc1, oc2, oc3, oc4, oc5, oc6, oc7; + svfloat32_t svalpha = svdup_f32(alpha); + + svbool_t pg16 = svptrue_b16(); + svbool_t pg16_low = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); + svbool_t pg32 = svptrue_b32(); + svbool_t pg32_low = svdupq_b32(1, 1, 0, 0); + svbool_t pg32_first = svdupq_b32(1, 0, 0, 0); + + bfloat16_t *ptr_a = (bfloat16_t *)A; + bfloat16_t *ptr_b = (bfloat16_t *)B; + FLOAT *ptr_c = C; + + bfloat16_t *ptr_a0, *ptr_a1, *ptr_a2, *ptr_a3; + bfloat16_t *ptr_b0, *ptr_b1; + FLOAT *ptr_c0, *ptr_c1, *ptr_c2, *ptr_c3; + + for (BLASLONG j = 0; j < n / 4; j++) { + ptr_c0 = ptr_c; + ptr_c1 = ptr_c0 + ldc; + ptr_c2 = ptr_c1 + ldc; + ptr_c3 = ptr_c2 + ldc; + ptr_c += 4 * ldc; + ptr_a = (bfloat16_t *)A; + + for (BLASLONG i = 0; i < m / 8; i++) { + ptr_a0 = ptr_a; + ptr_a += 8 * pad_k; + + ptr_b0 = ptr_b; + + INIT_C_8x4; + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + ma2 = svld1_bf16(pg16, ptr_a0 + 16); + ma3 = svld1_bf16(pg16, ptr_a0 + 24); + + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); + + MATMUL(0, 0); MATMUL(0, 1); + MATMUL(1, 0); MATMUL(1, 1); + MATMUL(2, 0); MATMUL(2, 1); + MATMUL(3, 0); MATMUL(3, 1); + + ptr_a0 += 32; + ptr_b0 += 16; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp1(mc20, mc30); + vc2 = svuzp2(mc00, mc10); + vc3 = svuzp2(mc20, mc30); + vc4 = svuzp1(mc01, mc11); + vc5 = svuzp1(mc21, mc31); + vc6 = svuzp2(mc01, mc11); + vc7 = svuzp2(mc21, mc31); + + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c0+4, oc1, vc1); + UPDATE_C(pg32, ptr_c1, oc2, vc2); + UPDATE_C(pg32, ptr_c1+4, oc3, vc3); + UPDATE_C(pg32, ptr_c2, oc4, vc4) + UPDATE_C(pg32, ptr_c2+4, oc5, vc5); + UPDATE_C(pg32, ptr_c3, oc6, vc6) + UPDATE_C(pg32, ptr_c3+4, oc7, vc7); + + ptr_c0 += 8; + ptr_c1 += 8; + ptr_c2 += 8; + ptr_c3 += 8; + } -#define LOAD_C_EVEN(M, N) \ - mc##M##N = svld1_gather_index(pg32_even, ptr_c0##N + 2 * M, off_vc); + if (m & 4) { + ptr_a0 = ptr_a; + ptr_a += 4 * pad_k; + ptr_b0 = ptr_b; + + INIT_C(0, 0); INIT_C(0, 1); + INIT_C(1, 0); INIT_C(1, 1); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); 
+ + MATMUL(0, 0); MATMUL(0, 1); + MATMUL(1, 0); MATMUL(1, 1); + + ptr_a0 += 16; + ptr_b0 += 16; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp2(mc00, mc10); + vc2 = svuzp1(mc01, mc11); + vc3 = svuzp2(mc01, mc11); + + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c1, oc1, vc1); + UPDATE_C(pg32, ptr_c2, oc2, vc2); + UPDATE_C(pg32, ptr_c3, oc3, vc3); + + ptr_c0 += 4; + ptr_c1 += 4; + ptr_c2 += 4; + ptr_c3 += 4; + } -#define LOAD_C_FIRST(M, N) \ - mc##M##N = svld1_gather_index(pg32_first, ptr_c0##N + 2 * M, off_vc); + if (m & 2) { + ptr_a0 = ptr_a; + ptr_a += 2 * pad_k; + ptr_b0 = ptr_b; + + INIT_C(0, 0); INIT_C(0, 1); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); + + MATMUL(0, 0); MATMUL(0, 1); + + ptr_a0 += 8; + ptr_b0 += 16; + } + + vc0 = svuzp1(mc00, mc00); + vc1 = svuzp2(mc00, mc00); + vc2 = svuzp1(mc01, mc01); + vc3 = svuzp2(mc01, mc01); + + UPDATE_C(pg32_low, ptr_c0, oc0, vc0); + UPDATE_C(pg32_low, ptr_c1, oc1, vc1); + UPDATE_C(pg32_low, ptr_c2, oc2, vc2); + UPDATE_C(pg32_low, ptr_c3, oc3, vc3); + + ptr_c0 += 2; + ptr_c1 += 2; + ptr_c2 += 2; + ptr_c3 += 2; + } -#define STORE_C(M, N) \ - svst1_scatter_index(pg32, ptr_c0##N + 2 * M, off_vc, mc##M##N); + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; -#define STORE_C_LOW(M, N) \ - svst1_scatter_index(pg32_low, ptr_c0##N + 2 * M, off_vc, mc##M##N); + INIT_C(0, 0); INIT_C(0, 1); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16_low, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + mb1 = svld1_bf16(pg16, ptr_b0 + 8); -#define STORE_C_EVEN(M, N) \ - svst1_scatter_index(pg32_even, ptr_c0##N + 2 * M, off_vc, mc##M##N); + MATMUL(0, 0); MATMUL(0, 1); -#define STORE_C_FIRST(M, N) \ - svst1_scatter_index(pg32_first, ptr_c0##N + 2 * M, off_vc, mc##M##N); + ptr_a0 += 4; + ptr_b0 += 16; + } -#else -#define LOAD_C(M, N) \ - mc##M##N = svdup_f32(0); \ - oc##M##N = svld1_gather_index(pg32, ptr_c0##N + 2 * M , off_vc); + vc1 = svuzp2(mc00, mc00); + vc3 = svuzp2(mc01, mc01); -#define LOAD_C_LOW(M, N) \ - mc##M##N = svdup_f32(0); \ - oc##M##N = svld1_gather_index(pg32_low, ptr_c0##N + 2 * M , off_vc); + UPDATE_C(pg32_first, ptr_c0, oc0, mc00); + UPDATE_C(pg32_first, ptr_c1, oc1, vc1); + UPDATE_C(pg32_first, ptr_c2, oc2, mc01); + UPDATE_C(pg32_first, ptr_c3, oc3, vc3); -#define LOAD_C_EVEN(M, N) \ - mc##M##N = svdup_f32(0); \ - oc##M##N = svld1_gather_index(pg32_even, ptr_c0##N + 2 * M , off_vc); + } -#define LOAD_C_FIRST(M, N) \ - mc##M##N = svdup_f32(0); \ - oc##M##N = svld1_gather_index(pg32_first, ptr_c0##N + 2 * M , off_vc); + ptr_b += 4 * pad_k; + } -#define STORE_C(M, N) \ - mc##M##N = svmad_z(pg32, svalpha, mc##M##N, oc##M##N); \ - svst1_scatter_index(pg32, ptr_c0##N + 2 * M, off_vc, mc##M##N); + if (n & 2) { + ptr_c0 = ptr_c; + ptr_c1 = ptr_c0 + ldc; + ptr_c += 2 * ldc; + ptr_a = (bfloat16_t *)A; -#define STORE_C_LOW(M, N) \ - mc##M##N = svmad_z(pg32_low, svalpha, mc##M##N, oc##M##N); \ - svst1_scatter_index(pg32_low, ptr_c0##N + 2 * M, off_vc, mc##M##N); + for (BLASLONG i = 0; i < m / 8; i++) { + ptr_a0 = ptr_a; + ptr_a += 8 * pad_k; -#define STORE_C_EVEN(M, N) \ - mc##M##N = svmad_z(pg32_even, svalpha, mc##M##N, oc##M##N); \ - svst1_scatter_index(pg32_even, ptr_c0##N + 2 * M, off_vc, mc##M##N); + ptr_b0 = ptr_b; -#define STORE_C_FIRST(M, N) \ - mc##M##N = svmad_z(pg32_first, svalpha, mc##M##N, oc##M##N); \ - svst1_scatter_index(pg32_first, ptr_c0##N + 2 * M, off_vc, mc##M##N); + INIT_C(0, 0); + INIT_C(1, 0); + INIT_C(2, 
0); + INIT_C(3, 0); -#endif + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + ma2 = svld1_bf16(pg16, ptr_a0 + 16); + ma3 = svld1_bf16(pg16, ptr_a0 + 24); -#define LOAD_A(M) ma##M = svld1_bf16(pg16, ptr_a##M); + mb0 = svld1_bf16(pg16, ptr_b0); -#define LOAD_B(N) mb##N = svld1_bf16(pg16, ptr_b##N); + MATMUL(0, 0); + MATMUL(1, 0); + MATMUL(2, 0); + MATMUL(3, 0); -#define MATMUL(M, N) mc##M##N = svbfmmla(mc##M##N, ma##M, mb##N); + ptr_a0 += 32; + ptr_b0 += 8; + } -#define LOAD_KREST_1(NAME, M) \ - m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), zero, zero, zero, \ - *(ptr_##NAME##M + 1), zero, zero, zero); + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp1(mc20, mc30); + vc2 = svuzp2(mc00, mc10); + vc3 = svuzp2(mc20, mc30); -#define LOAD_KREST_1_LOW(NAME, M) \ - m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), zero, zero, zero, zero, zero, \ - zero, zero); + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c0 + 4, oc1, vc1); + UPDATE_C(pg32, ptr_c1, oc2, vc2); + UPDATE_C(pg32, ptr_c1 + 4, oc3, vc3); -#define LOAD_KREST_2(NAME, M) \ - m##NAME##M = \ - svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), zero, zero, \ - *(ptr_##NAME##M + 2), *(ptr_##NAME##M + 3), zero, zero); + ptr_c0 += 8; + ptr_c1 += 8; + } -#define LOAD_KREST_2_LOW(NAME, M) \ - m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), zero, \ - zero, zero, zero, zero, zero); + if (m & 4) { + ptr_a0 = ptr_a; + ptr_a += 4 * pad_k; + ptr_b0 = ptr_b; -#define LOAD_KREST_3(NAME, M) \ - m##NAME##M = \ - svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), \ - *(ptr_##NAME##M + 2), zero, *(ptr_##NAME##M + 3), \ - *(ptr_##NAME##M + 4), *(ptr_##NAME##M + 5), zero); + INIT_C(0, 0); + INIT_C(1, 0); -#define LOAD_KREST_3_LOW(NAME, M) \ - m##NAME##M = \ - svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), \ - *(ptr_##NAME##M + 2), zero, zero, zero, zero, zero); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + mb0 = svld1_bf16(pg16, ptr_b0); + MATMUL(0, 0); + MATMUL(1, 0); + ptr_a0 += 16; + ptr_b0 += 8; + } + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp2(mc00, mc10); + + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c1, oc1, vc1); + + ptr_c0 += 4; + ptr_c1 += 4; + } + + if (m & 2) { + ptr_a0 = ptr_a; + ptr_a += 2 * pad_k; + ptr_b0 = ptr_b; + + INIT_C(0, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + + MATMUL(0, 0); + + ptr_a0 += 8; + ptr_b0 += 8; + } + + vc0 = svuzp1(mc00, mc00); + vc1 = svuzp2(mc00, mc00); + UPDATE_C(pg32_low, ptr_c0, oc0, vc0); + UPDATE_C(pg32_low, ptr_c1, oc1, vc1); + + ptr_c0 += 2; + ptr_c1 += 2; -#ifdef ALPHA_ONE -int sbgemm_kernel_neoversen2_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) -#else -int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) -#endif -{ - bfloat16_t *ptr_a = (bfloat16_t *)A; - bfloat16_t *ptr_b = (bfloat16_t *)B; - FLOAT *ptr_c = C; - - bfloat16_t *ptr_a0, *ptr_a1, *ptr_a2, *ptr_a3; - bfloat16_t *ptr_b0, *ptr_b1; - FLOAT *ptr_c00, *ptr_c01; - - svbfloat16_t ma0, ma1, ma2, ma3, mb0, mb1; - svfloat32_t mc00, mc01, mc10, mc11, mc20, mc21, mc30, mc31; -#ifndef ALPHA_ONE - svfloat32_t oc00, oc01, oc10, oc11, oc20, oc21, oc30, oc31; -#endif - svbool_t pg16 = svptrue_b16(); - svbool_t pg16_low = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); - svbool_t pg32 = 
svptrue_b32(); - svbool_t pg32_low = svdupq_b32(1, 1, 0, 0); - svbool_t pg32_even = svdupq_b32(1, 0, 1, 0); - svbool_t pg32_first = svdupq_b32(1, 0, 0, 0); - svfloat32_t svalpha = svdup_f32(alpha); - bfloat16 tmp = 0; - bfloat16_t zero = *((bfloat16_t *)&tmp); - BLASLONG krest = k & 3; - - // 00 01 10 11 - svuint32_t off_vc = svdupq_u32(0, (uint32_t)ldc, 1, (uint32_t)ldc + 1); - - for (BLASLONG j = 0; j < n / 4; j++) { - ptr_c00 = ptr_c; - ptr_c01 = ptr_c + 2 * ldc; - ptr_c += 4 * ldc; - - ptr_a = (bfloat16_t *)A; - - for (BLASLONG i = 0; i < m / 8; i++) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a0 + 2 * k; - ptr_a2 = ptr_a1 + 2 * k; - ptr_a3 = ptr_a2 + 2 * k; - ptr_a += 8 * k; - - ptr_b0 = ptr_b; - ptr_b1 = ptr_b0 + 2 * k; - - LOAD_C(0, 0); LOAD_C(0, 1); - LOAD_C(1, 0); LOAD_C(1, 1); - LOAD_C(2, 0); LOAD_C(2, 1); - LOAD_C(3, 0); LOAD_C(3, 1); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3); - LOAD_B(0); LOAD_B(1); - - MATMUL(0, 0); MATMUL(0, 1); - MATMUL(1, 0); MATMUL(1, 1); - MATMUL(2, 0); MATMUL(2, 1); - MATMUL(3, 0); MATMUL(3, 1); - - ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8; - ptr_b0 += 8; ptr_b1 += 8; - } - - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); - LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3); - LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); - LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3); - LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); - LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3); - LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1); - } - MATMUL(0, 0); MATMUL(0, 1); - MATMUL(1, 0); MATMUL(1, 1); - MATMUL(2, 0); MATMUL(2, 1); - MATMUL(3, 0); MATMUL(3, 1); - } - - STORE_C(0, 0); STORE_C(0, 1); - STORE_C(1, 0); STORE_C(1, 1); - STORE_C(2, 0); STORE_C(2, 1); - STORE_C(3, 0); STORE_C(3, 1); - - ptr_c00 += 8; ptr_c01 += 8; - } - - if (m & 4) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a0 + 2 * k; - ptr_a += 4 * k; - - ptr_b0 = ptr_b; - ptr_b1 = ptr_b0 + 2 * k; - - LOAD_C(0, 0); LOAD_C(0, 1); - LOAD_C(1, 0); LOAD_C(1, 1); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); LOAD_A(1); - LOAD_B(0); LOAD_B(1); - - MATMUL(0, 0); MATMUL(0, 1); - MATMUL(1, 0); MATMUL(1, 1); - - ptr_a0 += 8; ptr_a1 += 8; - ptr_b0 += 8; ptr_b1 += 8; - } - - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); - LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); - LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); - LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1); - } - MATMUL(0, 0); MATMUL(0, 1); - MATMUL(1, 0); MATMUL(1, 1); - } - - STORE_C(0, 0); STORE_C(0, 1); - STORE_C(1, 0); STORE_C(1, 1); - - ptr_c00 += 4; ptr_c01 += 4; - } - - if (m & 2) { - ptr_a0 = ptr_a; - ptr_a += 2 * k; - - ptr_b0 = ptr_b; - ptr_b1 = ptr_b0 + 2 * k; - - LOAD_C(0, 0); LOAD_C(0, 1); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); - LOAD_B(0); LOAD_B(1); - - MATMUL(0, 0); MATMUL(0, 1); - - ptr_a0 += 8; - ptr_b0 += 8; ptr_b1 += 8; - } - - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); - LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); - LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); - LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1); - } - MATMUL(0, 0); MATMUL(0, 1); - } - STORE_C(0, 0); STORE_C(0, 1); - ptr_c00 += 2; ptr_c01 += 2; - } - - if (m & 1) { - 
ptr_a0 = ptr_a; - - ptr_b0 = ptr_b; - ptr_b1 = ptr_b0 + 2 * k; - - LOAD_C_LOW(0, 0); LOAD_C_LOW(0, 1); - - for (BLASLONG p = 0; p < k / 4; p++) { - ma0 = svld1_bf16(pg16_low, ptr_a0); - LOAD_B(0); LOAD_B(1); - - MATMUL(0, 0); MATMUL(0, 1); - - ptr_a0 += 4; - ptr_b0 += 8; - ptr_b1 += 8; - } - - if (krest) { - if (krest == 1) { - LOAD_KREST_1_LOW(a, 0); - LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1); - } else if (krest == 2) { - LOAD_KREST_2_LOW(a, 0); - LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1); - } else if (krest == 3) { - LOAD_KREST_3_LOW(a, 0); - LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1); - } - MATMUL(0, 0); MATMUL(0, 1); - } - STORE_C_LOW(0, 0); STORE_C_LOW(0, 1); - } - - ptr_b += 4 * k; } - if (n & 2) { - ptr_c00 = ptr_c; - ptr_c += 2 * ldc; - - ptr_a = (bfloat16_t *)A; - - for (BLASLONG i = 0; i < m / 8; i++) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a0 + 2 * k; - ptr_a2 = ptr_a1 + 2 * k; - ptr_a3 = ptr_a2 + 2 * k; - ptr_a += 8 * k; - - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - LOAD_C(1, 0); - LOAD_C(2, 0); - LOAD_C(3, 0); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3); - LOAD_B(0); - - MATMUL(0, 0); - MATMUL(1, 0); - MATMUL(2, 0); - MATMUL(3, 0); - - ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8; - ptr_b0 += 8; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); - LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3); - LOAD_KREST_1(b, 0); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); - LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3); - LOAD_KREST_2(b, 0); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); - LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3); - LOAD_KREST_3(b, 0); - } - MATMUL(0, 0); - MATMUL(1, 0); - MATMUL(2, 0); - MATMUL(3, 0); - } - - STORE_C(0, 0); - STORE_C(1, 0); - STORE_C(2, 0); - STORE_C(3, 0); - - ptr_c00 += 8; - } - - if (m & 4) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a0 + 2 * k; - ptr_a += 4 * k; - - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - LOAD_C(1, 0); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); LOAD_A(1); - LOAD_B(0); - - MATMUL(0, 0); - MATMUL(1, 0); - - ptr_a0 += 8; ptr_a1 += 8; - ptr_b0 += 8; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); - LOAD_KREST_1(b, 0); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); - LOAD_KREST_2(b, 0); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); - LOAD_KREST_3(b, 0); - } - MATMUL(0, 0); - MATMUL(1, 0); - } - STORE_C(0, 0) - STORE_C(1, 0) - - ptr_c00 += 4; - } - - if (m & 2) { - ptr_a0 = ptr_a; - ptr_a += 2 * k; - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); - LOAD_B(0); - MATMUL(0, 0); - ptr_a0 += 8; - ptr_b0 += 8; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); - LOAD_KREST_1(b, 0); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); - LOAD_KREST_2(b, 0); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); - LOAD_KREST_3(b, 0); - } - MATMUL(0, 0); - } - STORE_C(0, 0); - ptr_c00 += 2; - } - - if (m & 1) { - ptr_a0 = ptr_a; - - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - - for (BLASLONG p = 0; p < k / 4; p++) { - ma0 = svld1_bf16(pg16_low, ptr_a0); - LOAD_B(0); - MATMUL(0, 0); - ptr_a0 += 4; - ptr_b0 += 8; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1_LOW(a, 0); - LOAD_KREST_1(b, 0); - } else if (krest == 2) { - LOAD_KREST_2_LOW(a, 0); - LOAD_KREST_2(b, 0); - } else if (krest == 3) { - LOAD_KREST_3_LOW(a, 0); - LOAD_KREST_3(b, 0); - } - MATMUL(0, 0); - } - STORE_C_LOW(0, 0); - } - - ptr_b += 2 * k; + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 
= ptr_b; + INIT_C(0, 0); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16_low, ptr_a0); + mb0 = svld1_bf16(pg16, ptr_b0); + MATMUL(0, 0); + ptr_a0 += 4; + ptr_b0 += 8; + } + vc1 = svuzp2(mc00, mc00); + + UPDATE_C(pg32_first, ptr_c0, oc0, mc00); + UPDATE_C(pg32_first, ptr_c1, oc1, vc1); + } + + ptr_b += 2 * pad_k; + } + + if (n & 1) { + ptr_c0 = ptr_c; + ptr_a = (bfloat16_t *)A; + + for (BLASLONG i = 0; i < m / 8; i++) { + ptr_a0 = ptr_a; + ptr_a += 8 * pad_k; + + ptr_b0 = ptr_b; + + INIT_C(0, 0); + INIT_C(1, 0); + INIT_C(2, 0); + INIT_C(3, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + ma2 = svld1_bf16(pg16, ptr_a0 + 16); + ma3 = svld1_bf16(pg16, ptr_a0 + 24); + + mb0 = svld1_bf16(pg16_low, ptr_b0); + + MATMUL(0, 0); + MATMUL(1, 0); + MATMUL(2, 0); + MATMUL(3, 0); + + ptr_a0 += 32; + ptr_b0 += 4; + } + + vc0 = svuzp1(mc00, mc10); + vc1 = svuzp1(mc20, mc30); + + UPDATE_C(pg32, ptr_c0, oc0, vc0); + UPDATE_C(pg32, ptr_c0 + 4, oc1, vc1); + + ptr_c0 += 8; + } + + if (m & 4) { + ptr_a0 = ptr_a; + ptr_a += 4 * pad_k; + ptr_b0 = ptr_b; + INIT_C(0, 0); + INIT_C(1, 0); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + ma1 = svld1_bf16(pg16, ptr_a0 + 8); + mb0 = svld1_bf16(pg16_low, ptr_b0); + MATMUL(0, 0); + MATMUL(1, 0); + ptr_a0 += 16; + ptr_b0 += 4; + } + vc0 = svuzp1(mc00, mc10); + UPDATE_C(pg32, ptr_c0, oc0, vc0); + ptr_c0 += 4; + } + + if (m & 2) { + ptr_a0 = ptr_a; + ptr_a += 2 * pad_k; + ptr_b0 = ptr_b; + + INIT_C(0, 0); + + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16, ptr_a0); + mb0 = svld1_bf16(pg16_low, ptr_b0); + + MATMUL(0, 0); + + ptr_a0 += 8; + ptr_b0 += 4; + } + vc0 = svuzp1(mc00, mc00); + UPDATE_C(pg32_low, ptr_c0, oc0, vc0); + ptr_c0 += 2; } - if (n & 1) { - ptr_c00 = ptr_c; - ptr_a = (bfloat16_t *) A; - - for (BLASLONG i = 0; i < m / 8; i++) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a0 + 2 * k; - ptr_a2 = ptr_a1 + 2 * k; - ptr_a3 = ptr_a2 + 2 * k; - ptr_a += 8 * k; - - ptr_b0 = ptr_b; - - LOAD_C_EVEN(0, 0); - LOAD_C_EVEN(1, 0); - LOAD_C_EVEN(2, 0); - LOAD_C_EVEN(3, 0); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3); - mb0 = svld1_bf16(pg16_low, ptr_b0); - - MATMUL(0, 0); - MATMUL(1, 0); - MATMUL(2, 0); - MATMUL(3, 0); - - ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8; - ptr_b0 += 4; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); - LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3); - LOAD_KREST_1_LOW(b, 0); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); - LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3); - LOAD_KREST_2_LOW(b, 0); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); - LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3); - LOAD_KREST_3_LOW(b, 0); - } - MATMUL(0, 0); - MATMUL(1, 0); - MATMUL(2, 0); - MATMUL(3, 0); - } - STORE_C_EVEN(0, 0) - STORE_C_EVEN(1, 0); - STORE_C_EVEN(2, 0); - STORE_C_EVEN(3, 0); - - ptr_c00 += 8; - } - - if (m & 4) { - ptr_a0 = ptr_a; - ptr_a1 = ptr_a0 + 2 * k; - ptr_a += 4 * k; - - ptr_b0 = ptr_b; - - LOAD_C_EVEN(0, 0); - LOAD_C_EVEN(1, 0); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); LOAD_A(1); - mb0 = svld1_bf16(pg16_low, ptr_b0); - - MATMUL(0, 0); - MATMUL(1, 0); - - ptr_a0 += 8; ptr_a1 += 8; - ptr_b0 += 4; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1); - LOAD_KREST_1_LOW(b, 0); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1); - LOAD_KREST_2_LOW(b, 0); - 
} else if (krest == 3) { - LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1); - LOAD_KREST_3_LOW(b, 0); - } - MATMUL(0, 0); - MATMUL(1, 0); - } - STORE_C_EVEN(0, 0) - STORE_C_EVEN(1, 0) - - ptr_c00 += 4; - } - - if (m & 2) { - ptr_a0 = ptr_a; - ptr_a += 2 * k; - - ptr_b0 = ptr_b; - - LOAD_C_EVEN(0, 0); - - for (BLASLONG p = 0; p < k / 4; p++) { - LOAD_A(0); - mb0 = svld1_bf16(pg16_low, ptr_b0); - - MATMUL(0, 0); - - ptr_a0 += 8; - ptr_b0 += 4; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1(a, 0); - LOAD_KREST_1_LOW(b, 0); - } else if (krest == 2) { - LOAD_KREST_2(a, 0); - LOAD_KREST_2_LOW(b, 0); - } else if (krest == 3) { - LOAD_KREST_3(a, 0); - LOAD_KREST_3_LOW(b, 0); - } - MATMUL(0, 0); - } - STORE_C_EVEN(0, 0); - ptr_c00 += 2; - } - if (m & 1) { - ptr_a0 = ptr_a; - ptr_b0 = ptr_b; - LOAD_C_FIRST(0, 0); - for (BLASLONG p = 0; p < k / 4; p++) { - ma0 = svld1_bf16(pg16_low, ptr_a0); - mb0 = svld1_bf16(pg16_low, ptr_b0); - - MATMUL(0, 0); - - ptr_a0 += 4; - ptr_b0 += 4; - } - if (krest) { - if (krest == 1) { - LOAD_KREST_1_LOW(a, 0); - LOAD_KREST_1_LOW(b, 0); - } else if (krest == 2) { - LOAD_KREST_2_LOW(a, 0); - LOAD_KREST_2_LOW(b, 0); - } else if (krest == 3) { - LOAD_KREST_3_LOW(a, 0); - LOAD_KREST_3_LOW(b, 0); - } - MATMUL(0, 0); - } - STORE_C_FIRST(0, 0); - } + if (m & 1) { + ptr_a0 = ptr_a; + ptr_b0 = ptr_b; + INIT_C(0, 0); + for (BLASLONG p = 0; p < pad_k; p += 4) { + ma0 = svld1_bf16(pg16_low, ptr_a0); + mb0 = svld1_bf16(pg16_low, ptr_b0); + MATMUL(0, 0); + ptr_a0 += 4; + ptr_b0 += 4; + } + UPDATE_C(pg32_first, ptr_c0, oc0, mc00); } + } - return 0; -} \ No newline at end of file + return 0; +} diff --git a/kernel/arm64/sbgemm_kernel_neoversen2_newbf16.c b/kernel/arm64/sbgemm_kernel_neoversen2_newbf16.c deleted file mode 100644 index 1bf743c7f..000000000 --- a/kernel/arm64/sbgemm_kernel_neoversen2_newbf16.c +++ /dev/null @@ -1,467 +0,0 @@ -/*************************************************************************** - * Copyright (c) 2022, The OpenBLAS Project - * All rights reserved. - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * 3. Neither the name of the OpenBLAS project nor the names of - * its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- * *****************************************************************************/ - -#include - -#include "common.h" - -#define LOAD_C(M, N) mc##M##N = svdup_f32(0); - -#define MATMUL(M, N) mc##M##N = svbfmmla(mc##M##N, ma##M, mb##N); - -#define LOAD_C_8x4 \ - do { \ - LOAD_C(0, 0); \ - LOAD_C(0, 1); \ - LOAD_C(1, 0); \ - LOAD_C(1, 1); \ - LOAD_C(2, 0); \ - LOAD_C(2, 1); \ - LOAD_C(3, 0); \ - LOAD_C(3, 1); \ - } while (0); - -#define STORE_C(PG, PTR, SRC, DST) \ - do { \ - SRC = svld1_f32((PG), (PTR)); \ - DST = svmad_z((PG), svalpha, DST, SRC); \ - svst1_f32((PG), (PTR), DST); \ - } while (0); - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT *A, IFLOAT *B, - FLOAT *C, BLASLONG ldc) { - BLASLONG pad_k = (k + 3) & ~3; - - svbfloat16_t ma0, ma1, ma2, ma3, mb0, mb1; - svfloat32_t mc00, mc01, mc10, mc11, mc20, mc21, mc30, mc31, - vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7, - oc0, oc1, oc2, oc3, oc4, oc5, oc6, oc7; - svfloat32_t svalpha = svdup_f32(alpha); - - svbool_t pg16 = svptrue_b16(); - svbool_t pg16_low = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); - svbool_t pg32 = svptrue_b32(); - svbool_t pg32_low = svdupq_b32(1, 1, 0, 0); - svbool_t pg32_first = svdupq_b32(1, 0, 0, 0); - - bfloat16_t *ptr_a = (bfloat16_t *)A; - bfloat16_t *ptr_b = (bfloat16_t *)B; - FLOAT *ptr_c = C; - - bfloat16_t *ptr_a0, *ptr_a1, *ptr_a2, *ptr_a3; - bfloat16_t *ptr_b0, *ptr_b1; - FLOAT *ptr_c0, *ptr_c1, *ptr_c2, *ptr_c3; - - for (BLASLONG j = 0; j < n / 4; j++) { - ptr_c0 = ptr_c; - ptr_c1 = ptr_c0 + ldc; - ptr_c2 = ptr_c1 + ldc; - ptr_c3 = ptr_c2 + ldc; - ptr_c += 4 * ldc; - ptr_a = (bfloat16_t *)A; - - for (BLASLONG i = 0; i < m / 8; i++) { - ptr_a0 = ptr_a; - ptr_a += 8 * pad_k; - - ptr_b0 = ptr_b; - - LOAD_C_8x4; - - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - ma1 = svld1_bf16(pg16, ptr_a0 + 8); - ma2 = svld1_bf16(pg16, ptr_a0 + 16); - ma3 = svld1_bf16(pg16, ptr_a0 + 24); - - mb0 = svld1_bf16(pg16, ptr_b0); - mb1 = svld1_bf16(pg16, ptr_b0 + 8); - -#if 0 - for (int q = 0; q < 8; q++) { - float tmp = 0; - *((bfloat16_t *)(&tmp) + 1) = ptr_b0[8+q]; - printf("%.1f ", tmp); - } - printf("\n"); -#endif - - MATMUL(0, 0); MATMUL(0, 1); - MATMUL(1, 0); MATMUL(1, 1); - MATMUL(2, 0); MATMUL(2, 1); - MATMUL(3, 0); MATMUL(3, 1); - - ptr_a0 += 32; - ptr_b0 += 16; - } - - vc0 = svuzp1(mc00, mc10); - vc1 = svuzp1(mc20, mc30); - vc2 = svuzp2(mc00, mc10); - vc3 = svuzp2(mc20, mc30); - vc4 = svuzp1(mc01, mc11); - vc5 = svuzp1(mc21, mc31); - vc6 = svuzp2(mc01, mc11); - vc7 = svuzp2(mc21, mc31); - - STORE_C(pg32, ptr_c0, oc0, vc0); - STORE_C(pg32, ptr_c0+4, oc1, vc1); - STORE_C(pg32, ptr_c1, oc2, vc2); - STORE_C(pg32, ptr_c1+4, oc3, vc3); - STORE_C(pg32, ptr_c2, oc4, vc4) - STORE_C(pg32, ptr_c2+4, oc5, vc5); - STORE_C(pg32, ptr_c3, oc6, vc6) - STORE_C(pg32, ptr_c3+4, oc7, vc7); - - ptr_c0 += 8; - ptr_c1 += 8; - ptr_c2 += 8; - ptr_c3 += 8; - } - - if (m & 4) { - ptr_a0 = ptr_a; - ptr_a += 4 * pad_k; - ptr_b0 = ptr_b; - - LOAD_C(0, 0); LOAD_C(0, 1); - LOAD_C(1, 0); LOAD_C(1, 1); - - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - ma1 = svld1_bf16(pg16, ptr_a0 + 8); - mb0 = svld1_bf16(pg16, ptr_b0); - mb1 = svld1_bf16(pg16, ptr_b0 + 8); - - MATMUL(0, 0); MATMUL(0, 1); - MATMUL(1, 0); MATMUL(1, 1); - - ptr_a0 += 16; - ptr_b0 += 16; - } - - vc0 = svuzp1(mc00, mc10); - vc1 = svuzp2(mc00, mc10); - vc2 = svuzp1(mc01, mc11); - vc3 = svuzp2(mc01, mc11); - - STORE_C(pg32, ptr_c0, oc0, vc0); - STORE_C(pg32, ptr_c1, oc1, vc1); - STORE_C(pg32, ptr_c2, oc2, vc2); - 
STORE_C(pg32, ptr_c3, oc3, vc3); - - ptr_c0 += 4; - ptr_c1 += 4; - ptr_c2 += 4; - ptr_c3 += 4; - } - - if (m & 2) { - ptr_a0 = ptr_a; - ptr_a += 2 * pad_k; - ptr_b0 = ptr_b; - - LOAD_C(0, 0); LOAD_C(0, 1); - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - mb0 = svld1_bf16(pg16, ptr_b0); - mb1 = svld1_bf16(pg16, ptr_b0 + 8); - - MATMUL(0, 0); MATMUL(0, 1); - - ptr_a0 += 8; - ptr_b0 += 16; - } - - vc0 = svuzp1(mc00, mc00); - vc1 = svuzp2(mc00, mc00); - vc2 = svuzp1(mc01, mc01); - vc3 = svuzp2(mc01, mc01); - - STORE_C(pg32_low, ptr_c0, oc0, vc0); - STORE_C(pg32_low, ptr_c1, oc1, vc1); - STORE_C(pg32_low, ptr_c2, oc2, vc2); - STORE_C(pg32_low, ptr_c3, oc3, vc3); - - ptr_c0 += 2; - ptr_c1 += 2; - ptr_c2 += 2; - ptr_c3 += 2; - } - - if (m & 1) { - ptr_a0 = ptr_a; - ptr_b0 = ptr_b; - - LOAD_C(0, 0); LOAD_C(0, 1); - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16_low, ptr_a0); - mb0 = svld1_bf16(pg16, ptr_b0); - mb1 = svld1_bf16(pg16, ptr_b0 + 8); - - MATMUL(0, 0); MATMUL(0, 1); - - ptr_a0 += 4; - ptr_b0 += 16; - } - - vc1 = svuzp2(mc00, mc00); - vc3 = svuzp2(mc01, mc01); - - STORE_C(pg32_first, ptr_c0, oc0, mc00); - STORE_C(pg32_first, ptr_c1, oc1, vc1); - STORE_C(pg32_first, ptr_c2, oc2, mc01); - STORE_C(pg32_first, ptr_c3, oc3, vc3); - - } - - ptr_b += 4 * pad_k; - } - - if (n & 2) { - ptr_c0 = ptr_c; - ptr_c1 = ptr_c0 + ldc; - ptr_c += 2 * ldc; - ptr_a = (bfloat16_t *)A; - - for (BLASLONG i = 0; i < m / 8; i++) { - ptr_a0 = ptr_a; - ptr_a += 8 * pad_k; - - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - LOAD_C(1, 0); - LOAD_C(2, 0); - LOAD_C(3, 0); - - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - ma1 = svld1_bf16(pg16, ptr_a0 + 8); - ma2 = svld1_bf16(pg16, ptr_a0 + 16); - ma3 = svld1_bf16(pg16, ptr_a0 + 24); - - mb0 = svld1_bf16(pg16, ptr_b0); - - MATMUL(0, 0); - MATMUL(1, 0); - MATMUL(2, 0); - MATMUL(3, 0); - - ptr_a0 += 32; - ptr_b0 += 8; - } - - vc0 = svuzp1(mc00, mc10); - vc1 = svuzp1(mc20, mc30); - vc2 = svuzp2(mc00, mc10); - vc3 = svuzp2(mc20, mc30); - - STORE_C(pg32, ptr_c0, oc0, vc0); - STORE_C(pg32, ptr_c0 + 4, oc1, vc1); - STORE_C(pg32, ptr_c1, oc2, vc2); - STORE_C(pg32, ptr_c1 + 4, oc3, vc3); - - ptr_c0 += 8; - ptr_c1 += 8; - } - - if (m & 4) { - ptr_a0 = ptr_a; - ptr_a += 4 * pad_k; - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - LOAD_C(1, 0); - - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - ma1 = svld1_bf16(pg16, ptr_a0 + 8); - mb0 = svld1_bf16(pg16, ptr_b0); - MATMUL(0, 0); - MATMUL(1, 0); - ptr_a0 += 16; - ptr_b0 += 8; - } - - vc0 = svuzp1(mc00, mc10); - vc1 = svuzp2(mc00, mc10); - - STORE_C(pg32, ptr_c0, oc0, vc0); - STORE_C(pg32, ptr_c1, oc1, vc1); - - ptr_c0 += 4; - ptr_c1 += 4; - } - - if (m & 2) { - ptr_a0 = ptr_a; - ptr_a += 2 * pad_k; - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - mb0 = svld1_bf16(pg16, ptr_b0); - - MATMUL(0, 0); - - ptr_a0 += 8; - ptr_b0 += 8; - } - - vc0 = svuzp1(mc00, mc00); - vc1 = svuzp2(mc00, mc00); - STORE_C(pg32_low, ptr_c0, oc0, vc0); - STORE_C(pg32_low, ptr_c1, oc1, vc1); - - ptr_c0 += 2; - ptr_c1 += 2; - - } - - if (m & 1) { - ptr_a0 = ptr_a; - ptr_b0 = ptr_b; - LOAD_C(0, 0); - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16_low, ptr_a0); - mb0 = svld1_bf16(pg16, ptr_b0); - MATMUL(0, 0); - ptr_a0 += 4; - ptr_b0 += 8; - } - vc1 = svuzp2(mc00, mc00); - - STORE_C(pg32_first, ptr_c0, oc0, mc00); - STORE_C(pg32_first, ptr_c1, oc1, vc1); - } - - ptr_b += 2 * pad_k; - } - - 
if (n & 1) { - ptr_c0 = ptr_c; - ptr_a = (bfloat16_t *)A; - - for (BLASLONG i = 0; i < m / 8; i++) { - ptr_a0 = ptr_a; - ptr_a += 8 * pad_k; - - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - LOAD_C(1, 0); - LOAD_C(2, 0); - LOAD_C(3, 0); - - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - ma1 = svld1_bf16(pg16, ptr_a0 + 8); - ma2 = svld1_bf16(pg16, ptr_a0 + 16); - ma3 = svld1_bf16(pg16, ptr_a0 + 24); - - mb0 = svld1_bf16(pg16_low, ptr_b0); - - MATMUL(0, 0); - MATMUL(1, 0); - MATMUL(2, 0); - MATMUL(3, 0); - - ptr_a0 += 32; - ptr_b0 += 4; - } - - vc0 = svuzp1(mc00, mc10); - vc1 = svuzp1(mc20, mc30); - - STORE_C(pg32, ptr_c0, oc0, vc0); - STORE_C(pg32, ptr_c0 + 4, oc1, vc1); - - ptr_c0 += 8; - } - - if (m & 4) { - ptr_a0 = ptr_a; - ptr_a += 4 * pad_k; - ptr_b0 = ptr_b; - LOAD_C(0, 0); - LOAD_C(1, 0); - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - ma1 = svld1_bf16(pg16, ptr_a0 + 8); - mb0 = svld1_bf16(pg16_low, ptr_b0); - MATMUL(0, 0); - MATMUL(1, 0); - ptr_a0 += 16; - ptr_b0 += 4; - } - vc0 = svuzp1(mc00, mc10); - STORE_C(pg32, ptr_c0, oc0, vc0); - ptr_c0 += 4; - } - - if (m & 2) { - ptr_a0 = ptr_a; - ptr_a += 2 * pad_k; - ptr_b0 = ptr_b; - - LOAD_C(0, 0); - - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16, ptr_a0); - mb0 = svld1_bf16(pg16_low, ptr_b0); - - MATMUL(0, 0); - - ptr_a0 += 8; - ptr_b0 += 4; - } - vc0 = svuzp1(mc00, mc00); - STORE_C(pg32_low, ptr_c0, oc0, vc0); - ptr_c0 += 2; - } - - if (m & 1) { - ptr_a0 = ptr_a; - ptr_b0 = ptr_b; - LOAD_C(0, 0); - for (BLASLONG p = 0; p < pad_k; p += 4) { - ma0 = svld1_bf16(pg16_low, ptr_a0); - mb0 = svld1_bf16(pg16_low, ptr_b0); - MATMUL(0, 0); - ptr_a0 += 4; - ptr_b0 += 4; - } - STORE_C(pg32_first, ptr_c0, oc0, mc00); - } - } - - return 0; -} diff --git a/kernel/arm64/sbgemm_ncopy_4_neoversen2.c b/kernel/arm64/sbgemm_ncopy_4_neoversen2.c index 0b0e7a427..22978a388 100644 --- a/kernel/arm64/sbgemm_ncopy_4_neoversen2.c +++ b/kernel/arm64/sbgemm_ncopy_4_neoversen2.c @@ -58,17 +58,6 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { svst1_bf16(pg16, (bfloat16_t *)b_offset + 8, v2); svst1_bf16(pg16, (bfloat16_t *)b_offset + 12, v3); -#if 0 - for (int line = 0; line < 4; line++) { - for (int p = 0; p < 4; p++) { - float tmp = 0; - *((bfloat16 *)(&tmp) + 1) = b_offset[line * 4 + p]; - printf("%f ", tmp); - } - printf("\n"); - } -#endif - b_offset += 16; a_offsetx[0] += 4; a_offsetx[1] += 4; diff --git a/kernel/arm64/sbgemm_ncopy_neoversen2.c b/kernel/arm64/sbgemm_ncopy_neoversen2.c deleted file mode 100644 index 594067ebb..000000000 --- a/kernel/arm64/sbgemm_ncopy_neoversen2.c +++ /dev/null @@ -1,101 +0,0 @@ -/*************************************************************************** - * Copyright (c) 2022, The OpenBLAS Project - * All rights reserved. - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * 3. Neither the name of the OpenBLAS project nor the names of - * its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * *****************************************************************************/ - -#include "common.h" - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { - IFLOAT *a_offset, *a_offset1, *a_offset2; - IFLOAT *b_offset; - - a_offset = a; - b_offset = b; - - for (BLASLONG j = 0; j < n / 2; j++) { - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset += 2 * lda; - for (BLASLONG i = 0; i < m / 4; i++) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset1 + 1); - *(b_offset + 2) = *(a_offset1 + 2); - *(b_offset + 3) = *(a_offset1 + 3); - *(b_offset + 4) = *(a_offset2 + 0); - *(b_offset + 5) = *(a_offset2 + 1); - *(b_offset + 6) = *(a_offset2 + 2); - *(b_offset + 7) = *(a_offset2 + 3); - - a_offset1 += 4; - a_offset2 += 4; - b_offset += 8; - } - BLASLONG rest = m & 3; - if (rest == 3) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset1 + 1); - *(b_offset + 2) = *(a_offset1 + 2); - *(b_offset + 3) = *(a_offset2 + 0); - *(b_offset + 4) = *(a_offset2 + 1); - *(b_offset + 5) = *(a_offset2 + 2); - b_offset += 6; - } else if (rest == 2) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset1 + 1); - *(b_offset + 2) = *(a_offset2 + 0); - *(b_offset + 3) = *(a_offset2 + 1); - b_offset += 4; - } else if (rest == 1) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset2 + 0); - b_offset += 2; - } - } - if (n & 1) { - for (BLASLONG i = 0; i < m / 4; i++) { - *(b_offset + 0) = *(a_offset + 0); - *(b_offset + 1) = *(a_offset + 1); - *(b_offset + 2) = *(a_offset + 2); - *(b_offset + 3) = *(a_offset + 3); - - b_offset += 4; - a_offset += 4; - } - BLASLONG rest = m & 3; - if (rest == 3) { - *(b_offset + 0) = *(a_offset + 0); - *(b_offset + 1) = *(a_offset + 1); - *(b_offset + 2) = *(a_offset + 2); - } else if (rest == 2) { - *(b_offset + 0) = *(a_offset + 0); - *(b_offset + 1) = *(a_offset + 1); - } else if (rest == 1) { - *(b_offset + 0) = *(a_offset + 0); - } - } - - return 0; -} diff --git a/kernel/arm64/sbgemm_tcopy_8_neoversen2.c b/kernel/arm64/sbgemm_tcopy_8_neoversen2.c index 6c37e4bcf..a058b5a8e 100644 --- a/kernel/arm64/sbgemm_tcopy_8_neoversen2.c +++ b/kernel/arm64/sbgemm_tcopy_8_neoversen2.c @@ -43,15 +43,6 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { for (BLASLONG i = 0; i < m / 4; i++) { for (BLASLONG line = 0; line < 8; line++) { -#if 0 - float fv0 = 0, fv1 = 0, fv2 = 0, fv3 = 0; - *((bfloat16 *)(&fv0) + 1) = a_offset0[line]; - *((bfloat16 *)(&fv1) + 1) = a_offset1[line]; - *((bfloat16 *)(&fv2) + 1) = a_offset2[line]; - *((bfloat16 *)(&fv3) + 1) = a_offset3[line]; - printf("%f %f %f %f\n", fv0, fv1, fv2, fv3); -#endif - b_offset[line * 4] = a_offset0[line]; b_offset[line * 4 + 
1] = a_offset1[line]; b_offset[line * 4 + 2] = a_offset2[line]; diff --git a/kernel/arm64/sbgemm_tcopy_neoversen2.c b/kernel/arm64/sbgemm_tcopy_neoversen2.c deleted file mode 100644 index 2f3313379..000000000 --- a/kernel/arm64/sbgemm_tcopy_neoversen2.c +++ /dev/null @@ -1,109 +0,0 @@ -/*************************************************************************** - * Copyright (c) 2022, The OpenBLAS Project - * All rights reserved. - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * 3. Neither the name of the OpenBLAS project nor the names of - * its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- * *****************************************************************************/ - -#include "common.h" - - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { - IFLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; - IFLOAT *b_offset; - a_offset = a; - b_offset = b; - - for (BLASLONG j = 0; j < n / 2; j++) { - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset3 = a_offset2 + lda; - a_offset4 = a_offset3 + lda; - a_offset += 2; - - for (BLASLONG i = 0; i < m / 4; i++) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset2 + 0); - *(b_offset + 2) = *(a_offset3 + 0); - *(b_offset + 3) = *(a_offset4 + 0); - *(b_offset + 4) = *(a_offset1 + 1); - *(b_offset + 5) = *(a_offset2 + 1); - *(b_offset + 6) = *(a_offset3 + 1); - *(b_offset + 7) = *(a_offset4 + 1); - - b_offset += 8; - a_offset1 += 4 * lda; - a_offset2 += 4 * lda; - a_offset3 += 4 * lda; - a_offset4 += 4 * lda; - } - - if (m & 3) { - BLASLONG rest = m & 3; - if (rest == 3) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset2 + 0); - *(b_offset + 2) = *(a_offset3 + 0); - *(b_offset + 3) = *(a_offset1 + 1); - *(b_offset + 4) = *(a_offset2 + 1); - *(b_offset + 5) = *(a_offset3 + 1); - b_offset += 6; - } else if (rest == 2) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset2 + 0); - *(b_offset + 2) = *(a_offset1 + 1); - *(b_offset + 3) = *(a_offset2 + 1); - b_offset += 4; - } else if (rest == 1) { - *(b_offset + 0) = *(a_offset1 + 0); - *(b_offset + 1) = *(a_offset1 + 1); - b_offset += 2; - } - } - } - if (n & 1) { - for (BLASLONG i = 0; i < m / 4; i++) { - *(b_offset + 0) = *(a_offset); - *(b_offset + 1) = *(a_offset + lda); - *(b_offset + 2) = *(a_offset + lda * 2); - *(b_offset + 3) = *(a_offset + lda * 3); - - b_offset += 4; - a_offset += 4 * lda; - } - BLASLONG rest = m & 3; - if (rest == 3) { - *(b_offset + 0) = *(a_offset); - *(b_offset + 1) = *(a_offset + lda); - *(b_offset + 2) = *(a_offset + lda * 2); - } else if (rest == 2) { - *(b_offset + 0) = *(a_offset); - *(b_offset + 1) = *(a_offset + lda); - } else if (rest == 1) { - *(b_offset + 0) = *(a_offset); - } - } - - return 0; -} From e27ad3a6cc248a3a21da9f3cbc6855c5b48cff04 Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Fri, 28 Oct 2022 09:10:40 -0400 Subject: [PATCH 048/154] add raptor lake ids --- cpuid_x86.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 4ac1de047..357376f42 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1544,6 +1544,13 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; + case 11: //family 6 exmodel 11 + switch (model) { + case 7: // Raptor Lake + if(support_avx2()) + return CPUTYPE_HASWELL; + } + break; } break; case 0x7: @@ -2334,6 +2341,12 @@ int get_coretype(void){ return CORE_NEHALEM; } + case 11: + switch (model) { + case 7: // Raptor Lake + if(support_avx2()) + return CORE_HASWELL; + } case 15: if (model <= 0x2) return CORE_NORTHWOOD; else return CORE_PRESCOTT; From 06b022b139c82c07a00f7b76e46c31b49b2cd728 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 30 Oct 2022 12:42:36 +0100 Subject: [PATCH 049/154] Fix ReLAPACK source selection --- CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0c92356e7..e830589e8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -212,10 +212,10 @@ if(NOT NO_LAPACKE) add_library(LAPACKE OBJECT ${LAPACKE_SOURCES}) list(APPEND TARGET_OBJS "$") endif() -if(BUILD_RELAPACK) - 
add_library(RELAPACK OBJECT ${RELA_SOURCES}) - list(APPEND TARGET_OBJS "$") -endif() +#if(BUILD_RELAPACK) +# add_library(RELAPACK OBJECT ${RELA_SOURCES}) +# list(APPEND TARGET_OBJS "$") +#endif() set(OpenBLAS_LIBS "") if(BUILD_STATIC_LIBS) add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) From eeebaf22948192c151c87903865de603e93f2874 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 30 Oct 2022 12:45:54 +0100 Subject: [PATCH 050/154] move INCLUDE_ALL to (c)make options --- relapack/config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/relapack/config.h b/relapack/config.h index 9d6919463..914efcbf0 100644 --- a/relapack/config.h +++ b/relapack/config.h @@ -45,7 +45,7 @@ // The following macros specify which routines are included in the library under // LAPACK's symbol names: 1 included, 0 not included -#define INCLUDE_ALL 1 +// #define INCLUDE_ALL 1 #define INCLUDE_XLAUUM INCLUDE_ALL #define INCLUDE_SLAUUM INCLUDE_XLAUUM From a082d54035d1e32db2dea16c74013c6ae6dc056d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 30 Oct 2022 12:47:01 +0100 Subject: [PATCH 051/154] Rename to avoid conflict with OpenBLAS' toplevel config.h --- relapack/{config.h => relapack_config.h} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename relapack/{config.h => relapack_config.h} (100%) diff --git a/relapack/config.h b/relapack/relapack_config.h similarity index 100% rename from relapack/config.h rename to relapack/relapack_config.h From 3ebf5d219d41f0613f08ba89e9998ae8333d6118 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 30 Oct 2022 12:49:07 +0100 Subject: [PATCH 052/154] handle INCLUDE_ALL and optional function prefixes --- relapack/Makefile | 86 ++++++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 39 deletions(-) diff --git a/relapack/Makefile b/relapack/Makefile index ddf101bd1..056a0ee48 100644 --- a/relapack/Makefile +++ b/relapack/Makefile @@ -1,53 +1,61 @@ TOPDIR = .. 
include $(TOPDIR)/Makefile.system - +ifeq ($(RELAPACK_REPLACE),0) +RELAPREFIX=RELAPACK_ +INCLALL=-DINCLUDE_ALL=0 +else +INCLALL=-DINCLUDE_ALL=1 +endif SRC = $(wildcard src/*.c) SRC1 = \ - src/slauum.c src/clauum.c src/dlauum.c src/zlauum.c \ - src/strtri.c src/dtrtri.c src/ctrtri.c src/ztrtri.c \ - src/spotrf.c src/dpotrf.c src/cpotrf.c src/zpotrf.c \ - src/sgetrf.c src/dgetrf.c src/cgetrf.c src/zgetrf.c + slauum.c clauum.c dlauum.c zlauum.c \ + strtri.c dtrtri.c ctrtri.c ztrtri.c \ + spotrf.c dpotrf.c cpotrf.c zpotrf.c \ + sgetrf.c dgetrf.c cgetrf.c zgetrf.c SRC2 = \ - src/cgbtrf.c src/cpbtrf.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \ - src/cgemmt.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \ - src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \ - src/chegst.c src/csytrf_rec2.c src/dtgsyl.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \ - src/chetrf.c src/csytrf_rook.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \ - src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/ztrsyl_rec2.c \ - src/chetrf_rook.c src/ctgsyl.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c \ - src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c \ - src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zsytrf.c + cgbtrf.c cpbtrf.c dsytrf_rec2.c sgbtrf.c ssytrf_rook.c zhegst.c zsytrf_rec2.c \ + cgemmt.c dgbtrf.c dsytrf_rook.c sgemmt.c ssytrf_rook_rec2.c zhetrf.c zsytrf_rook.c \ + csytrf.c dgemmt.c dsytrf_rook_rec2.c stgsyl.c zhetrf_rec2.c zsytrf_rook_rec2.c \ + chegst.c csytrf_rec2.c dtgsyl.c strsyl.c zhetrf_rook.c ztgsyl.c \ + chetrf.c csytrf_rook.c dtrsyl.c spbtrf.c strsyl_rec2.c zhetrf_rook_rec2.c ztrsyl.c \ + chetrf_rec2.c csytrf_rook_rec2.c dpbtrf.c dtrsyl_rec2.c ztrsyl_rec2.c \ + chetrf_rook.c ctgsyl.c ssygst.c zgbtrf.c zpbtrf.c \ + chetrf_rook_rec2.c ctrsyl.c dsygst.c f2c.c ssytrf.c zgemmt.c \ + ctrsyl_rec2.c dsytrf.c lapack_wrappers.c ssytrf_rec2.c zsytrf.c SRCX = \ - src/cgbtrf.c src/cpbtrf.c src/ctrtri.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \ - src/cgemmt.c src/cpotrf.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \ - src/cgetrf.c src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/sgetrf.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \ - src/chegst.c src/csytrf_rec2.c src/dgetrf.c src/dtgsyl.c src/slauum.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \ - src/chetrf.c src/csytrf_rook.c src/dlauum.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \ - src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/spotrf.c src/strtri.c src/zlauum.c src/ztrsyl_rec2.c \ - src/chetrf_rook.c src/ctgsyl.c src/dpotrf.c src/dtrtri.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c src/ztrtri.c \ - src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c src/zpotrf.c \ - src/clauum.c src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zgetrf.c src/zsytrf.c - -OBJS1 = $(SRC1:%.c=%.$(SUFFIX)) -OBJS2 = $(SRC2:%.c=%.o) + cgbtrf.c cpbtrf.c ctrtri.c dsytrf_rec2.c sgbtrf.c ssytrf_rook.c zhegst.c zsytrf_rec2.c \ + cgemmt.c cpotrf.c dgbtrf.c dsytrf_rook.c sgemmt.c ssytrf_rook_rec2.c zhetrf.c zsytrf_rook.c \ + cgetrf.c csytrf.c dgemmt.c dsytrf_rook_rec2.c sgetrf.c stgsyl.c zhetrf_rec2.c 
zsytrf_rook_rec2.c \ + chegst.c csytrf_rec2.c dgetrf.c dtgsyl.c slauum.c strsyl.c zhetrf_rook.c ztgsyl.c \ + chetrf.c csytrf_rook.c dlauum.c dtrsyl.c spbtrf.c strsyl_rec2.c zhetrf_rook_rec2.c ztrsyl.c \ + chetrf_rec2.c csytrf_rook_rec2.c dpbtrf.c dtrsyl_rec2.c spotrf.c strtri.c zlauum.c ztrsyl_rec2.c \ + chetrf_rook.c ctgsyl.c dpotrf.c dtrtri.c ssygst.c zgbtrf.c zpbtrf.c ztrtri.c \ + chetrf_rook_rec2.c ctrsyl.c dsygst.c f2c.c ssytrf.c zgemmt.c zpotrf.c \ + clauum.c ctrsyl_rec2.c dsytrf.c lapack_wrappers.c ssytrf_rec2.c zgetrf.c zsytrf.c + + +OBJS1 = $(SRC1:%.c=src/$(RELAPREFIX)%.$(SUFFIX)) +OBJS2 = $(SRC2:%.c=src/$(RELAPREFIX)%.o) OBJS = $(OBJS1) $(OBJS2) TEST_SUITS = \ - slauum dlauum clauum zlauum \ - spotrf dpotrf cpotrf zpotrf \ - spbtrf dpbtrf cpbtrf zpbtrf \ - ssygst dsygst chegst zhegst \ - ssytrf dsytrf csytrf chetrf zsytrf zhetrf \ - sgetrf dgetrf cgetrf zgetrf \ - sgbtrf dgbtrf cgbtrf zgbtrf \ - strsyl dtrsyl ctrsyl ztrsyl \ - stgsyl dtgsyl ctgsyl ztgsyl \ sgemmt dgemmt cgemmt zgemmt + + # slauum dlauum clauum zlauum \ + # spotrf dpotrf cpotrf zpotrf \ + # spbtrf dpbtrf cpbtrf zpbtrf \ + # ssygst dsygst chegst zhegst \ + # ssytrf dsytrf csytrf chetrf zsytrf zhetrf \ + # sgetrf dgetrf cgetrf zgetrf \ + # sgbtrf dgbtrf cgbtrf zgbtrf \ + # strsyl dtrsyl ctrsyl ztrsyl \ + # stgsyl dtgsyl ctgsyl ztgsyl \ + TESTS = $(TEST_SUITS:%=test/%.pass) # dummies TEST_EXES = $(TEST_SUITS:%=test/%.x) @@ -63,11 +71,11 @@ libs: $(OBJS) $(AR) -r $(TOPDIR)/$(LIBNAME) $(OBJS) $(RANLIB) $(TOPDIR)/$(LIBNAME) -%.$(SUFFIX): %.c config.h - $(CC) $(CFLAGS) -c $< -o $@ +src/$(RELAPREFIX)%.$(SUFFIX): src/%.c relapack_config.h + $(CC) -v $(CFLAGS) -I. $(INCLALL) -c $< -o $@ -%.o: %.c config.h - $(CC) $(CFLAGS) -c $< -o $@ +src/$(RELAPREFIX)%.o: src/%.c relapack_config.h + $(CC) -v $(CFLAGS) -I. 
$(INCLALL) -c $< -o $@ # ReLAPACK testing From ce7ea72de101707c1c0b8b4f9830e6dc7d25a44b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 30 Oct 2022 12:50:51 +0100 Subject: [PATCH 053/154] Fix include paths --- relapack/src/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/relapack/src/CMakeLists.txt b/relapack/src/CMakeLists.txt index 2d861f54b..78fb1431f 100644 --- a/relapack/src/CMakeLists.txt +++ b/relapack/src/CMakeLists.txt @@ -1,5 +1,6 @@ include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_BINARY_DIR}) +include_directories(${PROJECT_SOURCE_DIR}/relapack) set(RELAFILES clauum.c From d39978cd7ff702e2a9d3df439814a7a8a511deb0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 30 Oct 2022 12:53:19 +0100 Subject: [PATCH 054/154] Fix includes --- relapack/src/ctrsyl_rec2.c | 2 +- relapack/src/relapack.h | 4 ++-- relapack/src/ztrsyl_rec2.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/relapack/src/ctrsyl_rec2.c b/relapack/src/ctrsyl_rec2.c index 556491c7a..674d73709 100644 --- a/relapack/src/ctrsyl_rec2.c +++ b/relapack/src/ctrsyl_rec2.c @@ -10,7 +10,7 @@ http://www.netlib.org/f2c/libf2c.zip */ -#include "../config.h" +#include "relapack_config.h" #include "f2c.h" #if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES diff --git a/relapack/src/relapack.h b/relapack/src/relapack.h index 38c5c30d0..44652a074 100644 --- a/relapack/src/relapack.h +++ b/relapack/src/relapack.h @@ -1,7 +1,7 @@ #ifndef RELAPACK_INT_H #define RELAPACK_INT_H #include -#include "../../config.h" +#include "config.h" #if defined(OS_WINDOWS) && defined(__64BIT__) typedef long long BLASLONG; typedef unsigned long long BLASULONG; @@ -9,7 +9,7 @@ typedef unsigned long long BLASULONG; typedef long BLASLONG; typedef unsigned long BLASULONG; #endif -#include "../config.h" +#include "relapack_config.h" #include "../inc/relapack.h" diff --git a/relapack/src/ztrsyl_rec2.c b/relapack/src/ztrsyl_rec2.c index edc6ffc6b..d07a4e8de 100644 --- a/relapack/src/ztrsyl_rec2.c +++ b/relapack/src/ztrsyl_rec2.c @@ -10,7 +10,7 @@ http://www.netlib.org/f2c/libf2c.zip */ -#include "../config.h" +#include "relapack_config.h" #include "f2c.h" #if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES From ea6c5f3cf553a23f8e2e787307805e7874e1f9c6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 30 Oct 2022 12:55:23 +0100 Subject: [PATCH 055/154] Add option RELAPACK_REPLACE --- Makefile.rule | 5 ++++- Makefile.system | 4 ++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index a0ad90a68..9665d951a 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -131,6 +131,9 @@ BUILD_LAPACK_DEPRECATED = 1 # Build RecursiveLAPACK on top of LAPACK # BUILD_RELAPACK = 1 +# Have RecursiveLAPACK actually replace standard LAPACK routines instead of +# just adding its equivalents with a RELAPACK_ prefix +# RELAPACK_REPLACE = 1 # If you want to use the legacy threaded Level 3 implementation. # USE_SIMPLE_THREADED_LEVEL3 = 1 @@ -207,7 +210,7 @@ NO_AFFINITY = 1 # to the user space. If bigphysarea is enabled, it will use it. # DEVICEDRIVER_ALLOCATION = 1 -# If you need to synchronize FP CSR between threads (for x86/x86_64 and aarch64 only). +# If you need to synchronize FP CSR between threads (for x86/x86_64 only). # CONSISTENT_FPCSR = 1 # If any gemm argument m, n or k is less or equal this threshold, gemm will be execute diff --git a/Makefile.system b/Makefile.system index 10b952d4b..3c29ab3f3 100644 --- a/Makefile.system +++ b/Makefile.system @@ -9,6 +9,10 @@ ifndef TOPDIR TOPDIR = . 
endif +ifndef RELAPACK_REPLACE +RELAPACK_REPLACE=0 +endif + # we need to use the host system's architecture for getarch compile options even especially when cross-compiling HOSTARCH := $(shell uname -m) ifeq ($(HOSTARCH), amd64) From c9d78dc3b2d938a27a372fdad3b376397bf52da3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 31 Oct 2022 16:57:03 +0100 Subject: [PATCH 056/154] Remove excess initializer (leftover from rework of PR 3793) --- kernel/setparam-ref.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 16d19af1b..522c6d7d9 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -870,7 +870,6 @@ gotoblas_t TABLE_NAME = { #if BUILD_COMPLEX16==1 zgeadd_kTS, #endif - 1, // align_k }; #if (ARCH_ARM64) From c970717157ff601a3e53e7b2f60ae2ec467799c4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 1 Nov 2022 13:51:20 +0100 Subject: [PATCH 057/154] fix missing t in xgemmt rule Co-authored-by: Alexis <35051714+amontoison@users.noreply.github.com> --- interface/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/Makefile b/interface/Makefile index a1f4f66da..6f320d8f7 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -1337,7 +1337,7 @@ cgemmt.$(SUFFIX) cgemmt.$(PSUFFIX) : gemmt.c ../param.h zgemmt.$(SUFFIX) zgemmt.$(PSUFFIX) : gemmt.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) -xgemmt.$(SUFFIX) xgemm.$(PSUFFIX) : gemmt.c ../param.h +xgemmt.$(SUFFIX) xgemmt.$(PSUFFIX) : gemmt.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) ssymm.$(SUFFIX) ssymm.$(PSUFFIX) : symm.c From da6e426b13d4c7ad7ac066dd462c25f66c564322 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Nov 2022 18:13:35 +0100 Subject: [PATCH 058/154] fix Cooperlake not selectable via environment variable --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 9a693b06f..f61930983 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -1018,7 +1018,7 @@ static gotoblas_t *force_coretype(char *coretype){ char message[128]; //char mname[20]; - for ( i=1 ; i <= 24; i++) + for ( i=1 ; i <= 25; i++) { if (!strncasecmp(coretype,corename[i],20)) { From fcda11c1ae0c50d5ab393352d8b78084a4e1dcad Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Nov 2022 23:48:50 +0100 Subject: [PATCH 059/154] Revert special handling of GEMMT --- relapack/relapack_config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/relapack/relapack_config.h b/relapack/relapack_config.h index 914efcbf0..ba428a61b 100644 --- a/relapack/relapack_config.h +++ b/relapack/relapack_config.h @@ -115,7 +115,7 @@ #define INCLUDE_CTGSYL INCLUDE_XTGSYL #define INCLUDE_ZTGSYL INCLUDE_XTGSYL -#define INCLUDE_XGEMMT 1 +#define INCLUDE_XGEMMT INCLUDE_ALL #define INCLUDE_SGEMMT INCLUDE_XGEMMT #define INCLUDE_DGEMMT INCLUDE_XGEMMT #define INCLUDE_CGEMMT INCLUDE_XGEMMT From 1b777641825f9f97f2fb0a3386d32e1d106c36db Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Nov 2022 12:02:59 +0100 Subject: [PATCH 060/154] Conditionally leave out bits of LAPACK to be overridden by ReLAPACK --- interface/CMakeLists.txt | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 0b2998237..4e082928b 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -53,7 +53,7 @@ set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES # these do not have separate 'z' sources 
set(BLAS3_SOURCES gemm.c symm.c - trsm.c syrk.c syr2k.c + trsm.c syrk.c syr2k.c gemmt.c ) set(BLAS3_MANGLED_SOURCES @@ -189,7 +189,16 @@ if (NOT DEFINED NO_LAPACK) ) GenerateNamedObjects("${LAPACK_SOURCES}") + if (NOT RELAPACK_REPLACE) GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" 0 "" "" 0 3) + else () + GenerateNamedObjects("lapack/getrs.c" "" "" 0 "" "" 0 3) + GenerateNamedObjects("lapack/getf2.c" "" "" 0 "" "" 0 3) + GenerateNamedObjects("lapack/potf2.c" "" "" 0 "" "" 0 3) + GenerateNamedObjects("lapack/laswp.c" "" "" 0 "" "" 0 3) + GenerateNamedObjects("lapack/lauu2.c" "" "" 0 "" "" 0 3) + GenerateNamedObjects("lapack/trti2.c" "" "" 0 "" "" 0 3) + endif() endif () if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) From aa2a2d9c01357befb2d168d6833332b3dc50f008 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Nov 2022 12:04:46 +0100 Subject: [PATCH 061/154] Conditionally compile files that may get replaced by ReLAPACK --- lapack/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index fd4e57048..1d44e9490 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -39,8 +39,12 @@ set(UNIT_SOURCES2 trti2/trti2_L.c ) +if (NOT RELAPACK_REPLACE) GenerateNamedObjects("${LAPACK_SOURCES}") GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" false "" "" false 3) +else() +GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" false "" "" false 3) +endif() GenerateNamedObjects("laswp/generic/laswp_k_4.c" "" "laswp_plus" false "" "" false 3) GenerateNamedObjects("laswp/generic/laswp_k_4.c" "MINUS" "laswp_minus" false "" "" false 3) @@ -113,4 +117,3 @@ GenerateCombinationObjects("${UNIT_SOURCES}" "UNIT" "N" "" 4) GenerateCombinationObjects("${UNIT_SOURCES2}" "UNIT" "N" "" 0 "" "" 3) add_library(lapack OBJECT ${OPENBLAS_SRC}) - From 2e64722681cd94ec3f7c077ee3f96c5350ddc352 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Nov 2022 16:20:17 +0100 Subject: [PATCH 062/154] Update Makefile.rule --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 9665d951a..5e6cefc22 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -210,7 +210,7 @@ NO_AFFINITY = 1 # to the user space. If bigphysarea is enabled, it will use it. # DEVICEDRIVER_ALLOCATION = 1 -# If you need to synchronize FP CSR between threads (for x86/x86_64 only). +# If you need to synchronize FP CSR between threads (for x86/x86_64 and aarch64 only). 
# CONSISTENT_FPCSR = 1 # If any gemm argument m, n or k is less or equal this threshold, gemm will be execute From e6204d254f1ef1ca8524f7d82ceaf31cbe63c17b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Nov 2022 16:21:11 +0100 Subject: [PATCH 063/154] Update CMakeLists.txt --- interface/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 4e082928b..ce1434a90 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -53,7 +53,7 @@ set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES # these do not have separate 'z' sources set(BLAS3_SOURCES gemm.c symm.c - trsm.c syrk.c syr2k.c gemmt.c + trsm.c syrk.c syr2k.c ) set(BLAS3_MANGLED_SOURCES From 1865b152403661a99fb4b99f2c94ad0d88629651 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 9 Nov 2022 10:31:30 +0100 Subject: [PATCH 064/154] Add fallbacks to RaptorLake entry --- cpuid_x86.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 357376f42..4afa931f0 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1549,6 +1549,10 @@ int get_cpuname(void){ case 7: // Raptor Lake if(support_avx2()) return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; } break; } @@ -2344,8 +2348,14 @@ int get_coretype(void){ case 11: switch (model) { case 7: // Raptor Lake - if(support_avx2()) +#ifndef NO_AVX2 + if(support_avx2()) return CORE_HASWELL; +#endif + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; } case 15: if (model <= 0x2) return CORE_NORTHWOOD; From c957ad684ed6b8ca64f332221b376f2ad0fdc51a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 9 Nov 2022 10:46:43 +0100 Subject: [PATCH 065/154] Bump gcc requirement for NeoverseN2 and V1 to 10.4 --- Makefile.arm64 | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile.arm64 b/Makefile.arm64 index 480684422..e2c471c2b 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -89,11 +89,11 @@ endif endif # Use a72 tunings because Neoverse-V1 is only available -# in GCC>=9.4 +# in GCC>=10.4 ifeq ($(CORE), NEOVERSEV1) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) -ifeq ($(GCCVERSIONGTEQ9), 1) -ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) +ifeq ($(GCCVERSIONGTEQ10), 1) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11))) CCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 @@ -119,11 +119,11 @@ endif endif # Use a72 tunings because Neoverse-N2 is only available -# in GCC>=9.4 +# in GCC>=10.4 ifeq ($(CORE), NEOVERSEN2) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) -ifeq ($(GCCVERSIONGTEQ9), 1) -ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) +ifeq ($(GCCVERSIONGTEQ10), 1) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11))) ifneq ($(OSNAME), Darwin) CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 else From be546ec1ad283e8543a9a2ff181a019b6a753d26 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 9 Nov 2022 11:00:41 +0100 Subject: [PATCH 066/154] Add gcc options for Neoverse cpus --- cmake/cc.cmake | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 57e42781d..62278c4a7 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -155,6 +155,39 @@ if (${CORE} STREQUAL A64FX) endif () endif () +if (${CORE} STREQUAL 
NEOVERSEN2) + if (NOT DYNAMIC_ARCH) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + endif() + endif () +endif () + +if (${CORE} STREQUAL NEOVERSEV1) + if (NOT DYNAMIC_ARCH) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + endif() + endif () +endif () + +if (${CORE} STREQUAL NEOVERSEN1) + if (NOT DYNAMIC_ARCH) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=neoverse-n1") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + endif() + endif () +endif () + if (${CORE} STREQUAL ARMV8SVE) if (NOT DYNAMIC_ARCH) set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") From 6c1043eb41caaa1bda2f81a377316d191fd9947a Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Wed, 9 Nov 2022 08:28:23 -0500 Subject: [PATCH 067/154] Add [cz]scal microkernels for SKYLAKEX These are as similar to dscal_microk_skylakex-2.c as possible for consistency. Note that before this change SKYLAKEX+ uses generic C functions for cscal/zscal via commit 2271c350 from #2610 (which is masked by commit 086d87a30). However now #3799 disables FMAs (in turn enabled by `-march=skylake-avx512`) in the plain C code which fixes excessive LAPACK test failures more nicely. --- kernel/x86_64/KERNEL.SKYLAKEX | 3 - kernel/x86_64/cscal.c | 4 +- kernel/x86_64/cscal_microk_skylakex-2.c | 152 ++++++++++++++++++++++++ kernel/x86_64/zscal.c | 4 +- kernel/x86_64/zscal_microk_skylakex-2.c | 152 ++++++++++++++++++++++++ 5 files changed, 310 insertions(+), 5 deletions(-) create mode 100644 kernel/x86_64/cscal_microk_skylakex-2.c create mode 100644 kernel/x86_64/zscal_microk_skylakex-2.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index cb6f62981..548e5dcfc 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -44,8 +44,5 @@ DGEMM_BETA = dgemm_beta_skylakex.c CGEMMKERNEL = cgemm_kernel_8x2_skylakex.c ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c -CSCALKERNEL = ../arm/zscal.c -ZSCALKERNEL = ../arm/zscal.c - CASUMKERNEL = casum.c ZASUMKERNEL = zasum.c diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index 6ae66d973..95a99b8b9 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -41,7 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +#if defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +#include "cscal_microk_skylakex-2.c" +#elif defined(HASWELL) || defined(ZEN) #include "cscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "cscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/cscal_microk_skylakex-2.c b/kernel/x86_64/cscal_microk_skylakex-2.c new file mode 100644 index 000000000..8a622427b --- /dev/null +++ b/kernel/x86_64/cscal_microk_skylakex-2.c @@ -0,0 +1,152 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#include + +#define HAVE_KERNEL_16 1 + +static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + +#ifdef __AVX512CD__ + /* _mm512_addsub_ps does not exist so we flip signs for odd elements of da_i */ + __m512 da_r = _mm512_set1_ps(alpha[0]); + __m512 da_i = _mm512_set1_ps(alpha[1]) * _mm512_set4_ps(1, -1, 1, -1); + for (; i < n2; i += 32) { + __m512 x0 = _mm512_loadu_ps(&x[i + 0]); + __m512 x1 = _mm512_loadu_ps(&x[i + 16]); + __m512 y0 = _mm512_permute_ps(x0, 0xb1); + __m512 y1 = _mm512_permute_ps(x1, 0xb1); + _mm512_storeu_ps(&x[i + 0], _mm512_add_ps(da_r * x0, da_i * y0)); + _mm512_storeu_ps(&x[i + 16], _mm512_add_ps(da_r * x1, da_i * y1)); + } +#else + __m256 da_r = _mm256_set1_ps(alpha[0]); + __m256 da_i = _mm256_set1_ps(alpha[1]); + for (; i < n2; i += 32) { + __m256 x0 = _mm256_loadu_ps(&x[i + 0]); + __m256 x1 = _mm256_loadu_ps(&x[i + 8]); + __m256 x2 = _mm256_loadu_ps(&x[i + 16]); + __m256 x3 = _mm256_loadu_ps(&x[i + 24]); + __m256 y0 = _mm256_permute_ps(x0, 0xb1); + __m256 y1 = _mm256_permute_ps(x1, 0xb1); + __m256 y2 = _mm256_permute_ps(x2, 0xb1); + __m256 y3 = _mm256_permute_ps(x3, 0xb1); + _mm256_storeu_ps(&x[i + 0], _mm256_addsub_ps(da_r * x0, da_i * y0)); + _mm256_storeu_ps(&x[i + 8], _mm256_addsub_ps(da_r * x1, da_i * y1)); + _mm256_storeu_ps(&x[i + 16], _mm256_addsub_ps(da_r * x2, da_i * y2)); + _mm256_storeu_ps(&x[i + 24], _mm256_addsub_ps(da_r * x3, da_i * y3)); + } +#endif +} + + +static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + +#ifdef __AVX512CD__ + __m512 da_i = _mm512_set1_ps(alpha[1]) * _mm512_set4_ps(1, -1, 1, -1); + for (; i < n2; i += 32) { + __m512 y0 = _mm512_permute_ps(_mm512_loadu_ps(&x[i + 0]), 0xb1); + __m512 y1 = _mm512_permute_ps(_mm512_loadu_ps(&x[i + 16]), 0xb1); + _mm512_storeu_ps(&x[i + 0], da_i * y0); + _mm512_storeu_ps(&x[i + 16], da_i * y1); + } +#else + __m256 da_i = _mm256_set1_ps(alpha[1]) * _mm256_set_ps(1, -1, 1, -1, 1, -1, 1, -1); + for (; i < n2; i += 32) { + __m256 y0 = _mm256_permute_ps(_mm256_loadu_ps(&x[i + 0]), 0xb1); + __m256 y1 = _mm256_permute_ps(_mm256_loadu_ps(&x[i + 8]), 0xb1); + __m256 y2 = _mm256_permute_ps(_mm256_loadu_ps(&x[i + 16]), 0xb1); + __m256 y3 = _mm256_permute_ps(_mm256_loadu_ps(&x[i + 24]), 0xb1); + _mm256_storeu_ps(&x[i + 0], da_i * y0); + _mm256_storeu_ps(&x[i + 8], da_i * y1); + _mm256_storeu_ps(&x[i + 16], da_i * y2); + _mm256_storeu_ps(&x[i + 24], da_i * y3); + } +#endif +} + + +static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + +#ifdef __AVX512CD__ + __m512 da_r = _mm512_set1_ps(alpha[0]); + for (; i < n2; i += 32) { + _mm512_storeu_ps(&x[i + 0], da_r * _mm512_loadu_ps(&x[i + 0])); + _mm512_storeu_ps(&x[i + 16], da_r * _mm512_loadu_ps(&x[i + 16])); + } +#else + __m256 da_r = _mm256_set1_ps(alpha[0]); + for (; i < n2; i += 32) { + _mm256_storeu_ps(&x[i + 0], da_r * _mm256_loadu_ps(&x[i + 0])); + _mm256_storeu_ps(&x[i + 8], da_r * _mm256_loadu_ps(&x[i + 8])); + _mm256_storeu_ps(&x[i + 16], da_r * _mm256_loadu_ps(&x[i + 16])); + _mm256_storeu_ps(&x[i + 24], da_r * _mm256_loadu_ps(&x[i + 24])); + } +#endif +} + + +static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + 
BLASLONG i = 0; + BLASLONG n2 = n + n; + + /* question to self: Why is this not just memset() */ + +#ifdef __AVX512CD__ + __m512 zero = _mm512_setzero_ps(); + for (; i < n2; i += 32) { + _mm512_storeu_ps(&x[i], zero); + _mm512_storeu_ps(&x[i + 16], zero); + } +#else + __m256 zero = _mm256_setzero_ps(); + for (; i < n2; i += 32) { + _mm256_storeu_ps(&x[i + 0], zero); + _mm256_storeu_ps(&x[i + 8], zero); + _mm256_storeu_ps(&x[i + 16], zero); + _mm256_storeu_ps(&x[i + 24], zero); + } +#endif + +} + +#else +#include "cscal_microk_haswell-2.c" +#endif diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index dfdb4230b..45e3531b8 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -41,7 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +#if defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +#include "zscal_microk_skylakex-2.c" +#elif defined(HASWELL) || defined(ZEN) #include "zscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "zscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/zscal_microk_skylakex-2.c b/kernel/x86_64/zscal_microk_skylakex-2.c new file mode 100644 index 000000000..f9e05e333 --- /dev/null +++ b/kernel/x86_64/zscal_microk_skylakex-2.c @@ -0,0 +1,152 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#include + +#define HAVE_KERNEL_8 1 + +static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + +#ifdef __AVX512CD__ + /* _mm512_addsub_pd does not exist so we flip signs for odd elements of da_i */ + __m512d da_r = _mm512_set1_pd(alpha[0]); + __m512d da_i = _mm512_set1_pd(alpha[1]) * _mm512_set4_pd(1, -1, 1, -1); + for (; i < n2; i += 16) { + __m512d x0 = _mm512_loadu_pd(&x[i + 0]); + __m512d x1 = _mm512_loadu_pd(&x[i + 8]); + __m512d y0 = _mm512_permute_pd(x0, 0x55); + __m512d y1 = _mm512_permute_pd(x1, 0x55); + _mm512_storeu_pd(&x[i + 0], _mm512_add_pd(da_r * x0, da_i * y0)); + _mm512_storeu_pd(&x[i + 8], _mm512_add_pd(da_r * x1, da_i * y1)); + } +#else + __m256d da_r = _mm256_set1_pd(alpha[0]); + __m256d da_i = _mm256_set1_pd(alpha[1]); + for (; i < n2; i += 16) { + __m256d x0 = _mm256_loadu_pd(&x[i + 0]); + __m256d x1 = _mm256_loadu_pd(&x[i + 4]); + __m256d x2 = _mm256_loadu_pd(&x[i + 8]); + __m256d x3 = _mm256_loadu_pd(&x[i + 12]); + __m256d y0 = _mm256_permute_pd(x0, 0x05); + __m256d y1 = _mm256_permute_pd(x1, 0x05); + __m256d y2 = _mm256_permute_pd(x2, 0x05); + __m256d y3 = _mm256_permute_pd(x3, 0x05); + _mm256_storeu_pd(&x[i + 0], _mm256_addsub_pd(da_r * x0, da_i * y0)); + _mm256_storeu_pd(&x[i + 4], _mm256_addsub_pd(da_r * x1, da_i * y1)); + _mm256_storeu_pd(&x[i + 8], _mm256_addsub_pd(da_r * x2, da_i * y2)); + _mm256_storeu_pd(&x[i + 12], _mm256_addsub_pd(da_r * x3, da_i * y3)); + } +#endif +} + + +static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + +#ifdef __AVX512CD__ + __m512d da_i = _mm512_set1_pd(alpha[1]) * _mm512_set4_pd(1, -1, 1, -1); + for (; i < n2; i += 16) { + __m512d y0 = _mm512_permute_pd(_mm512_loadu_pd(&x[i + 0]), 0x55); + __m512d y1 = _mm512_permute_pd(_mm512_loadu_pd(&x[i + 8]), 0x55); + _mm512_storeu_pd(&x[i + 0], da_i * y0); + _mm512_storeu_pd(&x[i + 8], da_i * y1); + } +#else + __m256d da_i = _mm256_set1_pd(alpha[1]) * _mm256_set_pd(1, -1, 1, -1); + for (; i < n2; i += 16) { + __m256d y0 = _mm256_permute_pd(_mm256_loadu_pd(&x[i + 0]), 0x05); + __m256d y1 = _mm256_permute_pd(_mm256_loadu_pd(&x[i + 8]), 0x05); + __m256d y2 = _mm256_permute_pd(_mm256_loadu_pd(&x[i + 16]), 0x05); + __m256d y3 = _mm256_permute_pd(_mm256_loadu_pd(&x[i + 24]), 0x05); + _mm256_storeu_pd(&x[i + 0], da_i * y0); + _mm256_storeu_pd(&x[i + 4], da_i * y1); + _mm256_storeu_pd(&x[i + 8], da_i * y2); + _mm256_storeu_pd(&x[i + 12], da_i * y3); + } +#endif +} + + +static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + BLASLONG i = 0; + BLASLONG n2 = n + n; + +#ifdef __AVX512CD__ + __m512d da_r = _mm512_set1_pd(alpha[0]); + for (; i < n2; i += 16) { + _mm512_storeu_pd(&x[i + 0], da_r * _mm512_loadu_pd(&x[i + 0])); + _mm512_storeu_pd(&x[i + 8], da_r * _mm512_loadu_pd(&x[i + 8])); + } +#else + __m256d da_r = _mm256_set1_pd(alpha[0]); + for (; i < n2; i += 16) { + _mm256_storeu_pd(&x[i + 0], da_r * _mm256_loadu_pd(&x[i + 0])); + _mm256_storeu_pd(&x[i + 4], da_r * _mm256_loadu_pd(&x[i + 4])); + _mm256_storeu_pd(&x[i + 8], da_r * _mm256_loadu_pd(&x[i + 8])); + _mm256_storeu_pd(&x[i + 12], da_r * _mm256_loadu_pd(&x[i + 12])); + } +#endif +} + + +static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + 
BLASLONG i = 0; + BLASLONG n2 = n + n; + + /* question to self: Why is this not just memset() */ + +#ifdef __AVX512CD__ + __m512d zero = _mm512_setzero_pd(); + for (; i < n2; i += 16) { + _mm512_storeu_pd(&x[i], zero); + _mm512_storeu_pd(&x[i + 8], zero); + } +#else + __m256d zero = _mm256_setzero_pd(); + for (; i < n2; i += 16) { + _mm256_storeu_pd(&x[i + 0], zero); + _mm256_storeu_pd(&x[i + 4], zero); + _mm256_storeu_pd(&x[i + 8], zero); + _mm256_storeu_pd(&x[i + 12], zero); + } +#endif + +} + +#else +#include "zscal_microk_haswell-2.c" +#endif From 09dd90ca09cd61d14afa8b2f63fa7c250154ff07 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 9 Nov 2022 15:35:57 +0100 Subject: [PATCH 068/154] Limit cpu models in OSX_dynarch_cmake --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 2de6ec6ba..16b9da4f5 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -204,7 +204,7 @@ jobs: - script: | mkdir build cd build - cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON .. + cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DDYNAMIC_LIST='NEHALEM HASWELL SKYLAKEX' -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON .. cmake --build . ctest From d141cf341f4e2c80f47e76296439335ded59a356 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 9 Nov 2022 20:31:30 +0100 Subject: [PATCH 069/154] Increase the wait time for ppc jobs again --- .travis.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index a4edad726..06db6a95c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,7 +30,7 @@ matrix: before_script: &common-before - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" script: - - travis_wait 40 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + - travis_wait 50 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE - make -C utest $COMMON_FLAGS $BTYPE @@ -104,7 +104,7 @@ matrix: - sudo apt-get update - sudo apt-get install gcc-9 gfortran-9 -y script: - - travis_wait 40 make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 + - travis_wait 50 make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE - make -C utest $COMMON_FLAGS $BTYPE @@ -121,7 +121,7 @@ matrix: - sudo apt-get update - sudo apt-get install gcc-9 gfortran-9 -y script: - - travis_wait 40 make QUIET_MAKE=1 BUILD_BFLOAT16=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 + - travis_wait 50 make QUIET_MAKE=1 BUILD_BFLOAT16=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE - make -C utest $COMMON_FLAGS $BTYPE From cb48c29b6f43f1da0a4bb24c9ec1f7add06996c2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 12:49:59 +0100 Subject: [PATCH 070/154] Fix workspace calculation (Reference-LAPACK PR690) --- lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f | 17 ++++++++++++----- lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f | 17 ++++++++++++----- lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f | 17 ++++++++++++----- lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f | 17 ++++++++++++----- 4 files changed, 48 insertions(+), 20 deletions(-) diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f index 369ed1983..46eaf33b9 100644 --- 
a/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f @@ -81,7 +81,8 @@ C> \verbatim C> LWORK is INTEGER C> \endverbatim C> \verbatim -C> The dimension of the array WORK. The dimension can be divided into three parts. +C> The dimension of the array WORK. LWORK >= 1 if MIN(M,N) = 0, +C> otherwise the dimension can be divided into three parts. C> \endverbatim C> \verbatim C> 1) The part for the triangular factor T. If the very last T is not bigger @@ -212,7 +213,13 @@ C> LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) - IF ( NT.GT.NB ) THEN + IF( K.EQ.0 ) THEN + + LBWORK = 0 + LWKOPT = 1 + WORK( 1 ) = LWKOPT + + ELSE IF ( NT.GT.NB ) THEN LBWORK = K-NT * @@ -239,8 +246,9 @@ C> INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF ( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'CGEQRF', -INFO ) @@ -252,7 +260,6 @@ C> * Quick return if possible * IF( K.EQ.0 ) THEN - WORK( 1 ) = 1 RETURN END IF * diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f index be5720f4f..55cab8b23 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f @@ -81,7 +81,8 @@ C> \verbatim C> LWORK is INTEGER C> \endverbatim C> \verbatim -C> The dimension of the array WORK. The dimension can be divided into three parts. +C> The dimension of the array WORK. LWORK >= 1 if MIN(M,N) = 0, +C> otherwise the dimension can be divided into three parts. C> \endverbatim C> \verbatim C> 1) The part for the triangular factor T. If the very last T is not bigger @@ -212,7 +213,13 @@ C> LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) - IF ( NT.GT.NB ) THEN + IF( K.EQ.0 ) THEN + + LBWORK = 0 + LWKOPT = 1 + WORK( 1 ) = LWKOPT + + ELSE IF ( NT.GT.NB ) THEN LBWORK = K-NT * @@ -239,8 +246,9 @@ C> INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF ( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'DGEQRF', -INFO ) @@ -252,7 +260,6 @@ C> * Quick return if possible * IF( K.EQ.0 ) THEN - WORK( 1 ) = 1 RETURN END IF * diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f index bff973214..d2ad13ced 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f @@ -81,7 +81,8 @@ C> \verbatim C> LWORK is INTEGER C> \endverbatim C> \verbatim -C> The dimension of the array WORK. The dimension can be divided into three parts. +C> The dimension of the array WORK. LWORK >= 1 if MIN(M,N) = 0, +C> otherwise the dimension can be divided into three parts. C> \endverbatim C> \verbatim C> 1) The part for the triangular factor T. If the very last T is not bigger @@ -212,7 +213,13 @@ C> LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) - IF ( NT.GT.NB ) THEN + IF( K.EQ.0 ) THEN + + LBWORK = 0 + LWKOPT = 1 + WORK( 1 ) = LWKOPT + + ELSE IF ( NT.GT.NB ) THEN LBWORK = K-NT * @@ -239,8 +246,9 @@ C> INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF ( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. 
LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'SGEQRF', -INFO ) @@ -252,7 +260,6 @@ C> * Quick return if possible * IF( K.EQ.0 ) THEN - WORK( 1 ) = 1 RETURN END IF * diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f index 79e86b41b..623b88a8a 100644 --- a/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f +++ b/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f @@ -81,7 +81,8 @@ C> \verbatim C> LWORK is INTEGER C> \endverbatim C> \verbatim -C> The dimension of the array WORK. The dimension can be divided into three parts. +C> The dimension of the array WORK. LWORK >= 1 if MIN(M,N) = 0, +C> otherwise the dimension can be divided into three parts. C> \endverbatim C> \verbatim C> 1) The part for the triangular factor T. If the very last T is not bigger @@ -212,7 +213,13 @@ C> LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB)) LLWORK = SCEIL(REAL(LLWORK)/REAL(NB)) - IF ( NT.GT.NB ) THEN + IF( K.EQ.0 ) THEN + + LBWORK = 0 + LWKOPT = 1 + WORK( 1 ) = LWKOPT + + ELSE IF ( NT.GT.NB ) THEN LBWORK = K-NT * @@ -239,8 +246,9 @@ C> INFO = -2 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 - ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN - INFO = -7 + ELSE IF ( .NOT.LQUERY ) THEN + IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) ) + $ INFO = -7 END IF IF( INFO.NE.0 ) THEN CALL XERBLA( 'ZGEQRF', -INFO ) @@ -252,7 +260,6 @@ C> * Quick return if possible * IF( K.EQ.0 ) THEN - WORK( 1 ) = 1 RETURN END IF * From 3e2d52c502fc85c29c25a2e49ab8a46d4b1bebc2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 13:00:52 +0100 Subject: [PATCH 071/154] Fix workspace calculation in GEQRF/GERQF (Reference-LAPACK PR 638) --- lapack-netlib/SRC/sgeqrf.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/sgeqrf.f b/lapack-netlib/SRC/sgeqrf.f index f47d8bf32..b24615f7a 100644 --- a/lapack-netlib/SRC/sgeqrf.f +++ b/lapack-netlib/SRC/sgeqrf.f @@ -204,7 +204,7 @@ END IF * * Quick return if possible -* +* IF( K.EQ.0 ) THEN WORK( 1 ) = 1 RETURN From 6dcf737c5d9c8404ec7a4fda5a4958bdf669d418 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 14:51:39 +0100 Subject: [PATCH 072/154] Add NaN check functions for trapezoidal matrices (Reference-LAPACK PR738+742) --- lapack-netlib/LAPACKE/include/lapacke_utils.h | 32 ++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/include/lapacke_utils.h b/lapack-netlib/LAPACKE/include/lapacke_utils.h index f84604e8a..332a5024f 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_utils.h +++ b/lapack-netlib/LAPACKE/include/lapacke_utils.h @@ -68,7 +68,7 @@ void LAPACKE_xerbla( const char *name, lapack_int info ); /* Compare two chars (case-insensitive) */ lapack_logical LAPACKE_lsame( char ca, char cb ) #if defined __GNUC__ - __attribute__((const)) + __attribute__((const)) #endif ; @@ -128,6 +128,10 @@ void LAPACKE_ctp_trans( int matrix_layout, char uplo, char diag, void LAPACKE_ctr_trans( int matrix_layout, char uplo, char diag, lapack_int n, const lapack_complex_float *in, lapack_int ldin, lapack_complex_float *out, lapack_int ldout ); +void LAPACKE_ctz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_float *in, lapack_int ldin, + lapack_complex_float *out, lapack_int ldout ); void LAPACKE_dgb_trans( int matrix_layout, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, @@ -178,6 +182,10 @@ void LAPACKE_dtp_trans( int 
matrix_layout, char uplo, char diag, void LAPACKE_dtr_trans( int matrix_layout, char uplo, char diag, lapack_int n, const double *in, lapack_int ldin, double *out, lapack_int ldout ); +void LAPACKE_dtz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const double *in, lapack_int ldin, + double *out, lapack_int ldout ); void LAPACKE_sgb_trans( int matrix_layout, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, @@ -228,6 +236,10 @@ void LAPACKE_stp_trans( int matrix_layout, char uplo, char diag, void LAPACKE_str_trans( int matrix_layout, char uplo, char diag, lapack_int n, const float *in, lapack_int ldin, float *out, lapack_int ldout ); +void LAPACKE_stz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const float *in, lapack_int ldin, + float *out, lapack_int ldout ); void LAPACKE_zgb_trans( int matrix_layout, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, @@ -284,6 +296,10 @@ void LAPACKE_ztp_trans( int matrix_layout, char uplo, char diag, void LAPACKE_ztr_trans( int matrix_layout, char uplo, char diag, lapack_int n, const lapack_complex_double *in, lapack_int ldin, lapack_complex_double *out, lapack_int ldout ); +void LAPACKE_ztz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_double *in, lapack_int ldin, + lapack_complex_double *out, lapack_int ldout ); /* NaN checkers */ #define LAPACK_SISNAN( x ) ( x != x ) @@ -376,6 +392,10 @@ lapack_logical LAPACKE_ctr_nancheck( int matrix_layout, char uplo, char diag, lapack_int n, const lapack_complex_float *a, lapack_int lda ); +lapack_logical LAPACKE_ctz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_float *a, + lapack_int lda ); lapack_logical LAPACKE_dgb_nancheck( int matrix_layout, lapack_int m, lapack_int n, lapack_int kl, @@ -440,6 +460,9 @@ lapack_logical LAPACKE_dtr_nancheck( int matrix_layout, char uplo, char diag, lapack_int n, const double *a, lapack_int lda ); +lapack_logical LAPACKE_dtz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const double *a, lapack_int lda ); lapack_logical LAPACKE_sgb_nancheck( int matrix_layout, lapack_int m, lapack_int n, lapack_int kl, @@ -504,6 +527,9 @@ lapack_logical LAPACKE_str_nancheck( int matrix_layout, char uplo, char diag, lapack_int n, const float *a, lapack_int lda ); +lapack_logical LAPACKE_stz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const float *a, lapack_int lda ); lapack_logical LAPACKE_zgb_nancheck( int matrix_layout, lapack_int m, lapack_int n, lapack_int kl, @@ -574,6 +600,10 @@ lapack_logical LAPACKE_ztr_nancheck( int matrix_layout, char uplo, char diag, lapack_int n, const lapack_complex_double *a, lapack_int lda ); +lapack_logical LAPACKE_ztz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_double *a, + lapack_int lda ); #ifdef __cplusplus } From 23cfe58ee37a382a621cef75e12b6bc64e0d6a84 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 14:55:45 +0100 Subject: [PATCH 073/154] Add NaN check functions for trapezoidal matrices (Reference-LAPACK PR738+742) --- lapack-netlib/LAPACKE/src/lapacke_clantr.c | 6 +- lapack-netlib/LAPACKE/src/lapacke_clarfb.c | 68 ++++++------------- .../LAPACKE/src/lapacke_clarfb_work.c | 57 +++++----------- 
lapack-netlib/LAPACKE/src/lapacke_dlantr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_dlarfb.c | 68 ++++++------------- .../LAPACKE/src/lapacke_dlarfb_work.c | 57 +++++----------- lapack-netlib/LAPACKE/src/lapacke_slantr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_slarfb.c | 68 ++++++------------- .../LAPACKE/src/lapacke_slarfb_work.c | 57 +++++----------- lapack-netlib/LAPACKE/src/lapacke_zlantr.c | 2 +- lapack-netlib/LAPACKE/src/lapacke_zlarfb.c | 68 ++++++------------- .../LAPACKE/src/lapacke_zlarfb_work.c | 57 +++++----------- 12 files changed, 150 insertions(+), 362 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_clantr.c b/lapack-netlib/LAPACKE/src/lapacke_clantr.c index 88e765f2b..e00b6c578 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clantr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clantr.c @@ -33,8 +33,8 @@ #include "lapacke_utils.h" float LAPACKE_clantr( int matrix_layout, char norm, char uplo, char diag, - lapack_int m, lapack_int n, const lapack_complex_float* a, - lapack_int lda ) + lapack_int m, lapack_int n, const lapack_complex_float* a, + lapack_int lda ) { lapack_int info = 0; float res = 0.; @@ -46,7 +46,7 @@ float LAPACKE_clantr( int matrix_layout, char norm, char uplo, char diag, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_ctr_nancheck( matrix_layout, uplo, diag, MIN(m,n), a, lda ) ) { + if( LAPACKE_ctz_nancheck( matrix_layout, 'f', uplo, diag, m, n, a, lda ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c index ccd34cecd..8b1492bec 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c @@ -42,7 +42,9 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct lapack_int info = 0; lapack_int ldwork; lapack_complex_float* work = NULL; - lapack_int ncols_v, nrows_v; + lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_clarfb", -1 ); return -1; @@ -50,59 +52,27 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - lapack_int lrv, lcv; /* row, column stride */ - if( matrix_layout == LAPACK_COL_MAJOR ) { - lrv = 1; - lcv = ldv; - } else { - lrv = ldv; - lcv = 1; - } - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { - return -13; + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 
'l' : 'u'; + + if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) { + LAPACKE_xerbla( "LAPACKE_clarfb", -8 ); + return -8; + } + if( LAPACKE_ctz_nancheck( matrix_layout, direct, uplo, 'u', + nrows_v, ncols_v, v, ldv ) ) { + return -9; } if( LAPACKE_cge_nancheck( matrix_layout, k, k, t, ldt ) ) { return -11; } - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_clarfb", -8 ); - return -8; - } - if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*lrv], ldv ) ) - return -9; - if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_clarfb", -8 ); - return -8; - } - if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, - &v[(ncols_v-k)*lcv], ldv ) ) - return -9; - if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) - return -9; + if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -13; } } #endif diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c index 3ad97c22d..90ff0851f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c @@ -42,6 +42,8 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans, { lapack_int info = 0; lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; lapack_int ldc_t, ldt_t, ldv_t; lapack_complex_float *v_t = NULL, *t_t = NULL, *c_t = NULL; if( matrix_layout == LAPACK_COL_MAJOR ) { @@ -52,16 +54,14 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); + + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 
'l' : 'u'; + ldc_t = MAX(1,m); ldt_t = MAX(1,k); ldv_t = MAX(1,nrows_v); @@ -81,6 +81,11 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_clarfb_work", info ); return info; } + if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) { + info = -8; + LAPACKE_xerbla( "LAPACKE_clarfb_work", info ); + return info; + } /* Allocate memory for temporary array(s) */ v_t = (lapack_complex_float*) LAPACKE_malloc( sizeof(lapack_complex_float) * @@ -102,36 +107,8 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans, goto exit_level_2; } /* Transpose input matrices */ - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_ctr_trans( matrix_layout, 'l', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_cge_trans( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], ldv, - &v_t[k], ldv_t ); - } else if( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_clarfb_work", -8 ); - return -8; - } - LAPACKE_ctr_trans( matrix_layout, 'u', 'u', k, &v[(nrows_v-k)*ldv], - ldv, &v_t[nrows_v-k], ldv_t ); - LAPACKE_cge_trans( matrix_layout, nrows_v-k, ncols_v, v, ldv, v_t, - ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_ctr_trans( matrix_layout, 'u', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_cge_trans( matrix_layout, nrows_v, ncols_v-k, &v[k], ldv, - &v_t[k*ldv_t], ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_clarfb_work", -8 ); - return -8; - } - LAPACKE_ctr_trans( matrix_layout, 'l', 'u', k, &v[ncols_v-k], ldv, - &v_t[(ncols_v-k)*ldv_t], ldv_t ); - LAPACKE_cge_trans( matrix_layout, nrows_v, ncols_v-k, v, ldv, v_t, - ldv_t ); - } + LAPACKE_ctz_trans( matrix_layout, direct, uplo, 'u', nrows_v, ncols_v, + v, ldv, v_t, ldv_t ); LAPACKE_cge_trans( matrix_layout, k, k, t, ldt, t_t, ldt_t ); LAPACKE_cge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlantr.c b/lapack-netlib/LAPACKE/src/lapacke_dlantr.c index 4d1be93d7..b20af0eb4 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlantr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlantr.c @@ -46,7 +46,7 @@ double LAPACKE_dlantr( int matrix_layout, char norm, char uplo, char diag, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dtr_nancheck( matrix_layout, uplo, diag, MIN(m,n), a, lda ) ) { + if( LAPACKE_dtz_nancheck( matrix_layout, 'f', uplo, diag, m, n, a, lda ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c index 3c3c24c54..82e8fae52 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c @@ -41,7 +41,9 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct lapack_int info = 0; lapack_int ldwork; double* work = NULL; - lapack_int ncols_v, nrows_v; + lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_dlarfb", -1 ); return -1; @@ -49,59 +51,27 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - 
lapack_int lrv, lcv; /* row, column stride */ - if( matrix_layout == LAPACK_COL_MAJOR ) { - lrv = 1; - lcv = ldv; - } else { - lrv = ldv; - lcv = 1; - } - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { - return -13; + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + + if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) { + LAPACKE_xerbla( "LAPACKE_dlarfb", -8 ); + return -8; + } + if( LAPACKE_dtz_nancheck( matrix_layout, direct, uplo, 'u', + nrows_v, ncols_v, v, ldv ) ) { + return -9; } if( LAPACKE_dge_nancheck( matrix_layout, k, k, t, ldt ) ) { return -11; } - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_dlarfb", -8 ); - return -8; - } - if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*lrv], ldv ) ) - return -9; - if( LAPACKE_dge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_dtr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_dlarfb", -8 ); - return -8; - } - if( LAPACKE_dtr_nancheck( matrix_layout, 'l', 'u', k, - &v[(ncols_v-k)*lcv], ldv ) ) - return -9; - if( LAPACKE_dge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) - return -9; + if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -13; } } #endif diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c index 57c53bae3..1a68bf762 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c @@ -41,6 +41,8 @@ lapack_int LAPACKE_dlarfb_work( int matrix_layout, char side, char trans, { lapack_int info = 0; lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; lapack_int ldc_t, ldt_t, ldv_t; double *v_t = NULL, *t_t = NULL, *c_t = NULL; if( matrix_layout == LAPACK_COL_MAJOR ) { @@ -51,16 +53,14 @@ lapack_int LAPACKE_dlarfb_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - ncols_v = LAPACKE_lsame( storev, 'c' ) ? 
k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); + + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + ldc_t = MAX(1,m); ldt_t = MAX(1,k); ldv_t = MAX(1,nrows_v); @@ -80,6 +80,11 @@ lapack_int LAPACKE_dlarfb_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_dlarfb_work", info ); return info; } + if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) { + info = -8; + LAPACKE_xerbla( "LAPACKE_dlarfb_work", info ); + return info; + } /* Allocate memory for temporary array(s) */ v_t = (double*) LAPACKE_malloc( sizeof(double) * ldv_t * MAX(1,ncols_v) ); @@ -98,36 +103,8 @@ lapack_int LAPACKE_dlarfb_work( int matrix_layout, char side, char trans, goto exit_level_2; } /* Transpose input matrices */ - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_dtr_trans( matrix_layout, 'l', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_dge_trans( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], ldv, - &v_t[k], ldv_t ); - } else if( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_dlarfb_work", -8 ); - return -8; - } - LAPACKE_dtr_trans( matrix_layout, 'u', 'u', k, &v[(nrows_v-k)*ldv], - ldv, &v_t[nrows_v-k], ldv_t ); - LAPACKE_dge_trans( matrix_layout, nrows_v-k, ncols_v, v, ldv, v_t, - ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_dtr_trans( matrix_layout, 'u', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_dge_trans( matrix_layout, nrows_v, ncols_v-k, &v[k], ldv, - &v_t[k*ldv_t], ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_dlarfb_work", -8 ); - return -8; - } - LAPACKE_dtr_trans( matrix_layout, 'l', 'u', k, &v[ncols_v-k], ldv, - &v_t[(ncols_v-k)*ldv_t], ldv_t ); - LAPACKE_dge_trans( matrix_layout, nrows_v, ncols_v-k, v, ldv, v_t, - ldv_t ); - } + LAPACKE_dtz_trans( matrix_layout, direct, uplo, 'u', nrows_v, ncols_v, + v, ldv, v_t, ldv_t ); LAPACKE_dge_trans( matrix_layout, k, k, t, ldt, t_t, ldt_t ); LAPACKE_dge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_slantr.c b/lapack-netlib/LAPACKE/src/lapacke_slantr.c index 2f4c65889..e2f67cfd6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slantr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slantr.c @@ -46,7 +46,7 @@ float LAPACKE_slantr( int matrix_layout, char norm, char uplo, char diag, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_str_nancheck( matrix_layout, uplo, diag, MIN(m,n), a, lda ) ) { + if( LAPACKE_stz_nancheck( matrix_layout, 'f', uplo, diag, m, n, a, lda ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c index 37d51dee5..892648f4b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c @@ -41,7 +41,9 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct lapack_int info = 0; lapack_int 
ldwork; float* work = NULL; - lapack_int ncols_v, nrows_v; + lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_slarfb", -1 ); return -1; @@ -49,59 +51,27 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - lapack_int lrv, lcv; /* row, column stride */ - if( matrix_layout == LAPACK_COL_MAJOR ) { - lrv = 1; - lcv = ldv; - } else { - lrv = ldv; - lcv = 1; - } - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { - return -13; + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + + if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) { + LAPACKE_xerbla( "LAPACKE_slarfb", -8 ); + return -8; + } + if( LAPACKE_stz_nancheck( matrix_layout, direct, uplo, 'u', + nrows_v, ncols_v, v, ldv ) ) { + return -9; } if( LAPACKE_sge_nancheck( matrix_layout, k, k, t, ldt ) ) { return -11; } - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_slarfb", -8 ); - return -8; - } - if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*lrv], ldv ) ) - return -9; - if( LAPACKE_sge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_str_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_slarfb", -8 ); - return -8; - } - if( LAPACKE_str_nancheck( matrix_layout, 'l', 'u', k, - &v[(ncols_v-k)*lcv], ldv ) ) - return -9; - if( LAPACKE_sge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) - return -9; + if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -13; } } #endif diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c index 2f5d61676..d805a947a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c @@ -41,6 +41,8 @@ lapack_int LAPACKE_slarfb_work( int matrix_layout, char side, char trans, { lapack_int info = 0; lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; lapack_int ldc_t, ldt_t, ldv_t; float *v_t = NULL, *t_t = 
NULL, *c_t = NULL; if( matrix_layout == LAPACK_COL_MAJOR ) { @@ -51,16 +53,14 @@ lapack_int LAPACKE_slarfb_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); + + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u'; + ldc_t = MAX(1,m); ldt_t = MAX(1,k); ldv_t = MAX(1,nrows_v); @@ -80,6 +80,11 @@ lapack_int LAPACKE_slarfb_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_slarfb_work", info ); return info; } + if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) { + info = -8; + LAPACKE_xerbla( "LAPACKE_slarfb_work", info ); + return info; + } /* Allocate memory for temporary array(s) */ v_t = (float*)LAPACKE_malloc( sizeof(float) * ldv_t * MAX(1,ncols_v) ); if( v_t == NULL ) { @@ -97,36 +102,8 @@ lapack_int LAPACKE_slarfb_work( int matrix_layout, char side, char trans, goto exit_level_2; } /* Transpose input matrices */ - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_str_trans( matrix_layout, 'l', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_sge_trans( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], ldv, - &v_t[k], ldv_t ); - } else if( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_slarfb_work", -8 ); - return -8; - } - LAPACKE_str_trans( matrix_layout, 'u', 'u', k, &v[(nrows_v-k)*ldv], - ldv, &v_t[nrows_v-k], ldv_t ); - LAPACKE_sge_trans( matrix_layout, nrows_v-k, ncols_v, v, ldv, v_t, - ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_str_trans( matrix_layout, 'u', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_sge_trans( matrix_layout, nrows_v, ncols_v-k, &v[k], ldv, - &v_t[k*ldv_t], ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_slarfb_work", -8 ); - return -8; - } - LAPACKE_str_trans( matrix_layout, 'l', 'u', k, &v[ncols_v-k], ldv, - &v_t[(ncols_v-k)*ldv_t], ldv_t ); - LAPACKE_sge_trans( matrix_layout, nrows_v, ncols_v-k, v, ldv, v_t, - ldv_t ); - } + LAPACKE_stz_trans( matrix_layout, direct, uplo, 'u', nrows_v, ncols_v, + v, ldv, v_t, ldv_t ); LAPACKE_sge_trans( matrix_layout, k, k, t, ldt, t_t, ldt_t ); LAPACKE_sge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlantr.c b/lapack-netlib/LAPACKE/src/lapacke_zlantr.c index f6656d84d..4c078b9b0 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlantr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlantr.c @@ -46,7 +46,7 @@ double LAPACKE_zlantr( int matrix_layout, char norm, char uplo, char diag, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_ztr_nancheck( matrix_layout, uplo, 
diag, MIN(m,n), a, lda ) ) { + if( LAPACKE_ztz_nancheck( matrix_layout, 'f', uplo, diag, m, n, a, lda ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c index 7cd23dde8..25cedb506 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c @@ -42,7 +42,9 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct lapack_int info = 0; lapack_int ldwork; lapack_complex_double* work = NULL; - lapack_int ncols_v, nrows_v; + lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_zlarfb", -1 ); return -1; @@ -50,59 +52,27 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - lapack_int lrv, lcv; /* row, column stride */ - if( matrix_layout == LAPACK_COL_MAJOR ) { - lrv = 1; - lcv = ldv; - } else { - lrv = ldv; - lcv = 1; - } - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { - return -13; + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 
'l' : 'u'; + + if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) { + LAPACKE_xerbla( "LAPACKE_zlarfb", -8 ); + return -8; + } + if( LAPACKE_ztz_nancheck( matrix_layout, direct, uplo, 'u', + nrows_v, ncols_v, v, ldv ) ) { + return -9; } if( LAPACKE_zge_nancheck( matrix_layout, k, k, t, ldt ) ) { return -11; } - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_zlarfb", -8 ); - return -8; - } - if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k, - &v[(nrows_v-k)*lrv], ldv ) ) - return -9; - if( LAPACKE_zge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) { - if( LAPACKE_ztr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) ) - return -9; - if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, - &v[k*lrv], ldv ) ) - return -9; - } else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_zlarfb", -8 ); - return -8; - } - if( LAPACKE_ztr_nancheck( matrix_layout, 'l', 'u', k, - &v[(ncols_v-k)*lcv], ldv ) ) - return -9; - if( LAPACKE_zge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) ) - return -9; + if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -13; } } #endif diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c index 1b4f892a1..64eb05263 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c @@ -42,6 +42,8 @@ lapack_int LAPACKE_zlarfb_work( int matrix_layout, char side, char trans, { lapack_int info = 0; lapack_int nrows_v, ncols_v; + lapack_logical left, col, forward; + char uplo; lapack_int ldc_t, ldt_t, ldv_t; lapack_complex_double *v_t = NULL, *t_t = NULL, *c_t = NULL; if( matrix_layout == LAPACK_COL_MAJOR ) { @@ -52,16 +54,14 @@ lapack_int LAPACKE_zlarfb_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - nrows_v = ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : - ( LAPACKE_lsame( storev, 'r' ) ? k : 1) ); - ncols_v = LAPACKE_lsame( storev, 'c' ) ? k : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'l' ) ) ? m : - ( ( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( side, 'r' ) ) ? n : 1) ); + left = LAPACKE_lsame( side, 'l' ); + col = LAPACKE_lsame( storev, 'c' ); + forward = LAPACKE_lsame( direct, 'f' ); + + nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) ); + ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) ); + uplo = ( ( left && col ) || !( left || col ) ) ? 
'l' : 'u'; + ldc_t = MAX(1,m); ldt_t = MAX(1,k); ldv_t = MAX(1,nrows_v); @@ -81,6 +81,11 @@ lapack_int LAPACKE_zlarfb_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_zlarfb_work", info ); return info; } + if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) { + info = -8; + LAPACKE_xerbla( "LAPACKE_zlarfb_work", info ); + return info; + } /* Allocate memory for temporary array(s) */ v_t = (lapack_complex_double*) LAPACKE_malloc( sizeof(lapack_complex_double) * @@ -102,36 +107,8 @@ lapack_int LAPACKE_zlarfb_work( int matrix_layout, char side, char trans, goto exit_level_2; } /* Transpose input matrices */ - if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_ztr_trans( matrix_layout, 'l', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_zge_trans( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], ldv, - &v_t[k], ldv_t ); - } else if( LAPACKE_lsame( storev, 'c' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > nrows_v ) { - LAPACKE_xerbla( "LAPACKE_zlarfb_work", -8 ); - return -8; - } - LAPACKE_ztr_trans( matrix_layout, 'u', 'u', k, &v[(nrows_v-k)*ldv], - ldv, &v_t[nrows_v-k], ldv_t ); - LAPACKE_zge_trans( matrix_layout, nrows_v-k, ncols_v, v, ldv, v_t, - ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'f' ) ) { - LAPACKE_ztr_trans( matrix_layout, 'u', 'u', k, v, ldv, v_t, ldv_t ); - LAPACKE_zge_trans( matrix_layout, nrows_v, ncols_v-k, &v[k], ldv, - &v_t[k*ldv_t], ldv_t ); - } else if( LAPACKE_lsame( storev, 'r' ) && - LAPACKE_lsame( direct, 'b' ) ) { - if( k > ncols_v ) { - LAPACKE_xerbla( "LAPACKE_zlarfb_work", -8 ); - return -8; - } - LAPACKE_ztr_trans( matrix_layout, 'l', 'u', k, &v[ncols_v-k], ldv, - &v_t[(ncols_v-k)*ldv_t], ldv_t ); - LAPACKE_zge_trans( matrix_layout, nrows_v, ncols_v-k, v, ldv, v_t, - ldv_t ); - } + LAPACKE_ztz_trans( matrix_layout, direct, uplo, 'u', nrows_v, ncols_v, + v, ldv, v_t, ldv_t ); LAPACKE_zge_trans( matrix_layout, k, k, t, ldt, t_t, ldt_t ); LAPACKE_zge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); /* Call LAPACK function and adjust info */ From eba1112e38f774e7eca7300fe37cfc6a5f36f009 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 15:03:39 +0100 Subject: [PATCH 074/154] Add NaN check functions for trapezoidal matrices (Reference-LAPACK PR738+742) --- lapack-netlib/LAPACKE/utils/CMakeLists.txt | 81 +++++----- lapack-netlib/LAPACKE/utils/Makefile | 8 + .../LAPACKE/utils/lapacke_ctz_nancheck.c | 144 +++++++++++++++++ .../LAPACKE/utils/lapacke_ctz_trans.c | 153 ++++++++++++++++++ .../LAPACKE/utils/lapacke_dtz_nancheck.c | 143 ++++++++++++++++ .../LAPACKE/utils/lapacke_dtz_trans.c | 153 ++++++++++++++++++ .../LAPACKE/utils/lapacke_stz_nancheck.c | 143 ++++++++++++++++ .../LAPACKE/utils/lapacke_stz_trans.c | 153 ++++++++++++++++++ .../LAPACKE/utils/lapacke_ztz_nancheck.c | 144 +++++++++++++++++ .../LAPACKE/utils/lapacke_ztz_trans.c | 153 ++++++++++++++++++ 10 files changed, 1238 insertions(+), 37 deletions(-) create mode 100644 lapack-netlib/LAPACKE/utils/lapacke_ctz_nancheck.c create mode 100644 lapack-netlib/LAPACKE/utils/lapacke_ctz_trans.c create mode 100644 lapack-netlib/LAPACKE/utils/lapacke_dtz_nancheck.c create mode 100644 lapack-netlib/LAPACKE/utils/lapacke_dtz_trans.c create mode 100644 lapack-netlib/LAPACKE/utils/lapacke_stz_nancheck.c create mode 100644 lapack-netlib/LAPACKE/utils/lapacke_stz_trans.c create mode 100644 lapack-netlib/LAPACKE/utils/lapacke_ztz_nancheck.c create mode 100644 lapack-netlib/LAPACKE/utils/lapacke_ztz_trans.c diff 
--git a/lapack-netlib/LAPACKE/utils/CMakeLists.txt b/lapack-netlib/LAPACKE/utils/CMakeLists.txt index dd36ee33e..dfb9aa370 100644 --- a/lapack-netlib/LAPACKE/utils/CMakeLists.txt +++ b/lapack-netlib/LAPACKE/utils/CMakeLists.txt @@ -1,39 +1,46 @@ set(UTILS -lapacke_c_nancheck.c lapacke_ctr_trans.c lapacke_make_complex_float.c lapacke_zgb_nancheck.c -lapacke_cgb_nancheck.c lapacke_d_nancheck.c lapacke_s_nancheck.c lapacke_zgb_trans.c -lapacke_cgb_trans.c lapacke_dgb_nancheck.c lapacke_sgb_nancheck.c lapacke_zge_nancheck.c -lapacke_cge_nancheck.c lapacke_dgb_trans.c lapacke_sgb_trans.c lapacke_zge_trans.c -lapacke_cge_trans.c lapacke_dge_nancheck.c lapacke_sge_nancheck.c lapacke_zgg_nancheck.c -lapacke_cgg_nancheck.c lapacke_dge_trans.c lapacke_sge_trans.c lapacke_zgg_trans.c -lapacke_cgg_trans.c lapacke_dgg_nancheck.c lapacke_sgg_nancheck.c lapacke_zgt_nancheck.c -lapacke_cgt_nancheck.c lapacke_dgg_trans.c lapacke_sgg_trans.c lapacke_zhb_nancheck.c -lapacke_chb_nancheck.c lapacke_dgt_nancheck.c lapacke_sgt_nancheck.c lapacke_zhb_trans.c -lapacke_chb_trans.c lapacke_dhs_nancheck.c lapacke_shs_nancheck.c lapacke_zhe_nancheck.c -lapacke_che_nancheck.c lapacke_dhs_trans.c lapacke_shs_trans.c lapacke_zhe_trans.c -lapacke_che_trans.c lapacke_dpb_nancheck.c lapacke_spb_nancheck.c lapacke_zhp_nancheck.c -lapacke_chp_nancheck.c lapacke_dpb_trans.c lapacke_spb_trans.c lapacke_zhp_trans.c -lapacke_chp_trans.c lapacke_dpf_nancheck.c lapacke_spf_nancheck.c lapacke_zhs_nancheck.c -lapacke_chs_nancheck.c lapacke_dpf_trans.c lapacke_spf_trans.c lapacke_zhs_trans.c -lapacke_chs_trans.c lapacke_dpo_nancheck.c lapacke_spo_nancheck.c lapacke_zpb_nancheck.c -lapacke_cpb_nancheck.c lapacke_dpo_trans.c lapacke_spo_trans.c lapacke_zpb_trans.c -lapacke_cpb_trans.c lapacke_dpp_nancheck.c lapacke_spp_nancheck.c lapacke_zpf_nancheck.c -lapacke_cpf_nancheck.c lapacke_dpp_trans.c lapacke_spp_trans.c lapacke_zpf_trans.c -lapacke_cpf_trans.c lapacke_dpt_nancheck.c lapacke_spt_nancheck.c lapacke_zpo_nancheck.c -lapacke_cpo_nancheck.c lapacke_dsb_nancheck.c lapacke_ssb_nancheck.c lapacke_zpo_trans.c -lapacke_cpo_trans.c lapacke_dsb_trans.c lapacke_ssb_trans.c lapacke_zpp_nancheck.c -lapacke_cpp_nancheck.c lapacke_dsp_nancheck.c lapacke_ssp_nancheck.c lapacke_zpp_trans.c -lapacke_cpp_trans.c lapacke_dsp_trans.c lapacke_ssp_trans.c lapacke_zpt_nancheck.c -lapacke_cpt_nancheck.c lapacke_dst_nancheck.c lapacke_sst_nancheck.c lapacke_zsp_nancheck.c -lapacke_csp_nancheck.c lapacke_dsy_nancheck.c lapacke_ssy_nancheck.c lapacke_zsp_trans.c -lapacke_csp_trans.c lapacke_dsy_trans.c lapacke_ssy_trans.c lapacke_zst_nancheck.c -lapacke_cst_nancheck.c lapacke_dtb_nancheck.c lapacke_stb_nancheck.c lapacke_zsy_nancheck.c -lapacke_csy_nancheck.c lapacke_dtb_trans.c lapacke_stb_trans.c lapacke_zsy_trans.c -lapacke_csy_trans.c lapacke_dtf_nancheck.c lapacke_stf_nancheck.c lapacke_ztb_nancheck.c -lapacke_ctb_nancheck.c lapacke_dtf_trans.c lapacke_stf_trans.c lapacke_ztb_trans.c -lapacke_ctb_trans.c lapacke_dtp_nancheck.c lapacke_stp_nancheck.c lapacke_ztf_nancheck.c -lapacke_ctf_nancheck.c lapacke_dtp_trans.c lapacke_stp_trans.c lapacke_ztf_trans.c -lapacke_ctf_trans.c lapacke_dtr_nancheck.c lapacke_str_nancheck.c lapacke_ztp_nancheck.c -lapacke_ctp_nancheck.c lapacke_dtr_trans.c lapacke_str_trans.c lapacke_ztp_trans.c -lapacke_ctp_trans.c lapacke_lsame.c lapacke_xerbla.c lapacke_ztr_nancheck.c -lapacke_ctr_nancheck.c lapacke_make_complex_double.c lapacke_z_nancheck.c lapacke_ztr_trans.c +lapacke_c_nancheck.c lapacke_d_nancheck.c 
lapacke_s_nancheck.c lapacke_z_nancheck.c +lapacke_cgb_nancheck.c lapacke_dgb_nancheck.c lapacke_sgb_nancheck.c lapacke_zgb_trans.c +lapacke_cgb_trans.c lapacke_dgb_trans.c lapacke_sgb_trans.c lapacke_zgb_nancheck.c +lapacke_cge_nancheck.c lapacke_dge_nancheck.c lapacke_sge_nancheck.c lapacke_zge_nancheck.c +lapacke_cge_trans.c lapacke_dge_trans.c lapacke_sge_trans.c lapacke_zge_trans.c +lapacke_cgg_nancheck.c lapacke_dgg_nancheck.c lapacke_sgg_nancheck.c lapacke_zgg_nancheck.c +lapacke_cgg_trans.c lapacke_dgg_trans.c lapacke_sgg_trans.c lapacke_zgg_trans.c +lapacke_cgt_nancheck.c lapacke_dgt_nancheck.c lapacke_sgt_nancheck.c lapacke_zgt_nancheck.c +lapacke_chb_nancheck.c lapacke_dsb_nancheck.c lapacke_ssb_nancheck.c lapacke_zhb_nancheck.c +lapacke_chb_trans.c lapacke_dsb_trans.c lapacke_ssb_trans.c lapacke_zhb_trans.c +lapacke_che_nancheck.c lapacke_zhe_nancheck.c +lapacke_che_trans.c lapacke_zhe_trans.c +lapacke_chp_nancheck.c lapacke_zhp_nancheck.c +lapacke_chp_trans.c lapacke_zhp_trans.c +lapacke_chs_nancheck.c lapacke_dhs_nancheck.c lapacke_shs_nancheck.c lapacke_zhs_nancheck.c +lapacke_chs_trans.c lapacke_dhs_trans.c lapacke_shs_trans.c lapacke_zhs_trans.c +lapacke_cpb_nancheck.c lapacke_dpb_nancheck.c lapacke_spb_nancheck.c lapacke_zpb_nancheck.c +lapacke_cpb_trans.c lapacke_dpb_trans.c lapacke_spb_trans.c lapacke_zpb_trans.c +lapacke_cpf_nancheck.c lapacke_dpf_nancheck.c lapacke_spf_nancheck.c lapacke_zpf_nancheck.c +lapacke_cpf_trans.c lapacke_dpf_trans.c lapacke_spf_trans.c lapacke_zpf_trans.c +lapacke_cpo_nancheck.c lapacke_dpo_nancheck.c lapacke_spo_nancheck.c lapacke_zpo_nancheck.c +lapacke_cpo_trans.c lapacke_dpo_trans.c lapacke_spo_trans.c lapacke_zpo_trans.c +lapacke_cpp_nancheck.c lapacke_dpp_nancheck.c lapacke_spp_nancheck.c lapacke_zpp_nancheck.c +lapacke_cpp_trans.c lapacke_dpp_trans.c lapacke_spp_trans.c lapacke_zpp_trans.c +lapacke_cpt_nancheck.c lapacke_dpt_nancheck.c lapacke_spt_nancheck.c lapacke_zpt_nancheck.c +lapacke_csp_nancheck.c lapacke_dsp_nancheck.c lapacke_ssp_nancheck.c lapacke_zsp_nancheck.c +lapacke_csp_trans.c lapacke_dsp_trans.c lapacke_ssp_trans.c lapacke_zsp_trans.c +lapacke_cst_nancheck.c lapacke_dst_nancheck.c lapacke_sst_nancheck.c lapacke_zst_nancheck.c +lapacke_csy_nancheck.c lapacke_dsy_nancheck.c lapacke_ssy_nancheck.c lapacke_zsy_nancheck.c +lapacke_csy_trans.c lapacke_dsy_trans.c lapacke_ssy_trans.c lapacke_zsy_trans.c +lapacke_ctb_nancheck.c lapacke_dtb_nancheck.c lapacke_stb_nancheck.c lapacke_ztb_nancheck.c +lapacke_ctb_trans.c lapacke_dtb_trans.c lapacke_stb_trans.c lapacke_ztb_trans.c +lapacke_ctf_nancheck.c lapacke_dtf_nancheck.c lapacke_stf_nancheck.c lapacke_ztf_nancheck.c +lapacke_ctf_trans.c lapacke_dtf_trans.c lapacke_stf_trans.c lapacke_ztf_trans.c +lapacke_ctp_nancheck.c lapacke_dtp_nancheck.c lapacke_stp_nancheck.c lapacke_ztp_nancheck.c +lapacke_ctp_trans.c lapacke_dtp_trans.c lapacke_stp_trans.c lapacke_ztp_trans.c +lapacke_ctr_nancheck.c lapacke_dtr_nancheck.c lapacke_str_nancheck.c lapacke_ztr_nancheck.c +lapacke_ctr_trans.c lapacke_dtr_trans.c lapacke_str_trans.c lapacke_ztr_trans.c +lapacke_ctz_nancheck.c lapacke_dtz_nancheck.c lapacke_stz_nancheck.c lapacke_ztz_nancheck.c +lapacke_ctz_trans.c lapacke_dtz_trans.c lapacke_stz_trans.c lapacke_ztz_trans.c + +lapacke_make_complex_float.c lapacke_make_complex_double.c +lapacke_lsame.c +lapacke_xerbla.c ) diff --git a/lapack-netlib/LAPACKE/utils/Makefile b/lapack-netlib/LAPACKE/utils/Makefile index adc573650..a1f863107 100644 --- a/lapack-netlib/LAPACKE/utils/Makefile +++ 
b/lapack-netlib/LAPACKE/utils/Makefile @@ -76,6 +76,8 @@ OBJ = lapacke_cgb_nancheck.o \ lapacke_ctp_trans.o \ lapacke_ctr_nancheck.o \ lapacke_ctr_trans.o \ + lapacke_ctz_nancheck.o \ + lapacke_ctz_trans.o \ lapacke_dgb_nancheck.o \ lapacke_dgb_trans.o \ lapacke_dge_nancheck.o \ @@ -110,6 +112,8 @@ OBJ = lapacke_cgb_nancheck.o \ lapacke_dtp_trans.o \ lapacke_dtr_nancheck.o \ lapacke_dtr_trans.o \ + lapacke_dtz_nancheck.o \ + lapacke_dtz_trans.o \ lapacke_lsame.o \ lapacke_sgb_nancheck.o \ lapacke_sgb_trans.o \ @@ -145,6 +149,8 @@ OBJ = lapacke_cgb_nancheck.o \ lapacke_stp_trans.o \ lapacke_str_nancheck.o \ lapacke_str_trans.o \ + lapacke_stz_nancheck.o \ + lapacke_stz_trans.o \ lapacke_xerbla.o \ lapacke_zgb_nancheck.o \ lapacke_zgb_trans.o \ @@ -184,6 +190,8 @@ OBJ = lapacke_cgb_nancheck.o \ lapacke_ztp_trans.o \ lapacke_ztr_nancheck.o \ lapacke_ztr_trans.o \ + lapacke_ztz_nancheck.o \ + lapacke_ztz_trans.o \ lapacke_make_complex_float.o \ lapacke_make_complex_double.o diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ctz_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_ctz_nancheck.c new file mode 100644 index 000000000..bea956781 --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_ctz_nancheck.c @@ -0,0 +1,144 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Check a trapezoidal matrix for NaN entries. The shape of the trapezoidal + matrix is determined by the arguments `direct` and `uplo`. `Direct` chooses + the diagonal which shall be considered and `uplo` tells us whether we use the + upper or lower part of the matrix with respect to the chosen diagonal. 
+ + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +lapack_logical LAPACKE_ctz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_float *a, + lapack_int lda ) +{ + lapack_logical colmaj, front, lower, unit; + + if( a == NULL ) return (lapack_logical) 0; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return (lapack_logical) 0; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_offset = tri_n * ( !colmaj ? lda : 1 ); + } else if( !lower && n > m ) { + rect_offset = tri_n * ( colmaj ? lda : 1 ); + } + } else { + if( m > n ) { + tri_offset = rect_m * ( !colmaj ? lda : 1 ); + if( !lower ) { + rect_offset = 0; + } + } else if( n > m ) { + tri_offset = rect_n * ( colmaj ? lda : 1 ); + if( lower ) { + rect_offset = 0; + } + } + } + + /* Check rectangular part */ + if( rect_offset >= 0 ) { + if( LAPACKE_cge_nancheck( matrix_layout, rect_m, rect_n, + &a[rect_offset], lda) ) { + return (lapack_logical) 1; + } + } + + /* Check triangular part */ + return LAPACKE_ctr_nancheck( matrix_layout, uplo, diag, tri_n, + &a[tri_offset], lda ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ctz_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_ctz_trans.c new file mode 100644 index 000000000..8910aee7d --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_ctz_trans.c @@ -0,0 +1,153 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
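For illustration only, and not part of any patch in this series: a standalone sketch of the triangle/rectangle split that LAPACKE_ctz_nancheck performs above, for the assumed case direct='F', uplo='L', column-major, m=5, n=3, lda=5. The sizes are made up; only the offset arithmetic mirrors the helper's logic.

/* Worked instance of the offset computation in the tz nancheck helpers:
 * a 5x3 forward/lower trapezoid is checked as a 3x3 lower triangle at a[0]
 * plus a 2x3 rectangle that starts at a[3] in column-major storage. */
#include <stdio.h>

int main(void)
{
    const int m = 5, n = 3, lda = 5;
    const int colmaj = 1;                    /* LAPACK_COL_MAJOR */
    const int tri_n  = (m < n) ? m : n;      /* 3x3 triangular part */
    const int rect_m = (m > n) ? m - n : m;  /* 2 rows left over    */
    const int rect_n = (n > m) ? n - m : n;  /* 3 columns           */
    /* front && lower && m > n: the rectangle sits directly below the triangle */
    const int rect_offset = tri_n * (colmaj ? 1 : lda);
    printf("triangle %dx%d at a[0], rectangle %dx%d at a[%d]\n",
           tri_n, tri_n, rect_m, rect_n, rect_offset);
    return 0;
}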
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Converts input triangular matrix from row-major(C) to column-major(Fortran) + layout or vice versa. The shape of the trapezoidal matrix is determined by + the arguments `direct` and `uplo`. `Direct` chooses the diagonal which shall + be considered and `uplo` tells us whether we use the upper or lower part of + the matrix with respect to the chosen diagonal. + + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +void LAPACKE_ctz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_float *in, lapack_int ldin, + lapack_complex_float *out, lapack_int ldout ) +{ + lapack_logical colmaj, front, lower, unit; + + if( in == NULL || out == NULL ) return ; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_in_offset = 0; + lapack_int tri_out_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_in_offset = -1; + lapack_int rect_out_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_in_offset = tri_n * ( !colmaj ? ldin : 1 ); + rect_out_offset = tri_n * ( colmaj ? ldout : 1 ); + } else if( !lower && n > m ) { + rect_in_offset = tri_n * ( colmaj ? 
ldin : 1 ); + rect_out_offset = tri_n * ( !colmaj ? ldout : 1 ); + } + } else { + if( m > n ) { + tri_in_offset = rect_m * ( !colmaj ? ldin : 1 ); + tri_out_offset = rect_m * ( colmaj ? ldout : 1 ); + if( !lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } else if( n > m ) { + tri_in_offset = rect_n * ( colmaj ? ldin : 1 ); + tri_out_offset = rect_n * ( !colmaj ? ldout : 1 ); + if( lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } + } + + /* Copy & transpose rectangular part */ + if( rect_in_offset >= 0 && rect_out_offset >= 0 ) { + LAPACKE_cge_trans( matrix_layout, rect_m, rect_n, + &in[rect_in_offset], ldin, + &out[rect_out_offset], ldout ); + } + + /* Copy & transpose triangular part */ + return LAPACKE_ctr_trans( matrix_layout, uplo, diag, tri_n, + &in[tri_in_offset], ldin, + &out[tri_out_offset], ldout ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_dtz_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_dtz_nancheck.c new file mode 100644 index 000000000..cd2ae6731 --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_dtz_nancheck.c @@ -0,0 +1,143 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Check a trapezoidal matrix for NaN entries. The shape of the trapezoidal + matrix is determined by the arguments `direct` and `uplo`. `Direct` chooses + the diagonal which shall be considered and `uplo` tells us whether we use the + upper or lower part of the matrix with respect to the chosen diagonal. 
+ + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +lapack_logical LAPACKE_dtz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const double *a, lapack_int lda ) +{ + lapack_logical colmaj, front, lower, unit; + + if( a == NULL ) return (lapack_logical) 0; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return (lapack_logical) 0; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_offset = tri_n * ( !colmaj ? lda : 1 ); + } else if( !lower && n > m ) { + rect_offset = tri_n * ( colmaj ? lda : 1 ); + } + } else { + if( m > n ) { + tri_offset = rect_m * ( !colmaj ? lda : 1 ); + if( !lower ) { + rect_offset = 0; + } + } else if( n > m ) { + tri_offset = rect_n * ( colmaj ? lda : 1 ); + if( lower ) { + rect_offset = 0; + } + } + } + + /* Check rectangular part */ + if( rect_offset >= 0 ) { + if( LAPACKE_dge_nancheck( matrix_layout, rect_m, rect_n, + &a[rect_offset], lda ) ) { + return (lapack_logical) 1; + } + } + + /* Check triangular part */ + return LAPACKE_dtr_nancheck( matrix_layout, uplo, diag, tri_n, + &a[tri_offset], lda ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_dtz_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_dtz_trans.c new file mode 100644 index 000000000..80d94ead9 --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_dtz_trans.c @@ -0,0 +1,153 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
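For illustration only, and not part of any patch in this series: a minimal caller of the new LAPACKE_dtz_nancheck shown above, assuming the lapacke_utils.h header from this series is on the include path and the program is linked against the built LAPACKE library; the 4x3 matrix and the injected NaN are made up.

#include <math.h>
#include <stdio.h>
#include "lapacke_utils.h"

int main(void)
{
    double a[12];                      /* 4x3, column-major, lda = 4 */
    for (int i = 0; i < 12; ++i) a[i] = 1.0;
    a[11] = nan("");                   /* row 4, column 3: inside the forward/lower trapezoid */
    lapack_logical bad =
        LAPACKE_dtz_nancheck( LAPACK_COL_MAJOR, 'f', 'l', 'n', 4, 3, a, 4 );
    printf("NaN found: %d\n", (int) bad);   /* expected output: NaN found: 1 */
    return 0;
}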
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Converts input triangular matrix from row-major(C) to column-major(Fortran) + layout or vice versa. The shape of the trapezoidal matrix is determined by + the arguments `direct` and `uplo`. `Direct` chooses the diagonal which shall + be considered and `uplo` tells us whether we use the upper or lower part of + the matrix with respect to the chosen diagonal. + + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +void LAPACKE_dtz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const double *in, lapack_int ldin, + double *out, lapack_int ldout ) +{ + lapack_logical colmaj, front, lower, unit; + + if( in == NULL || out == NULL ) return ; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_in_offset = 0; + lapack_int tri_out_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_in_offset = -1; + lapack_int rect_out_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_in_offset = tri_n * ( !colmaj ? ldin : 1 ); + rect_out_offset = tri_n * ( colmaj ? ldout : 1 ); + } else if( !lower && n > m ) { + rect_in_offset = tri_n * ( colmaj ? 
ldin : 1 ); + rect_out_offset = tri_n * ( !colmaj ? ldout : 1 ); + } + } else { + if( m > n ) { + tri_in_offset = rect_m * ( !colmaj ? ldin : 1 ); + tri_out_offset = rect_m * ( colmaj ? ldout : 1 ); + if( !lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } else if( n > m ) { + tri_in_offset = rect_n * ( colmaj ? ldin : 1 ); + tri_out_offset = rect_n * ( !colmaj ? ldout : 1 ); + if( lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } + } + + /* Copy & transpose rectangular part */ + if( rect_in_offset >= 0 && rect_out_offset >= 0 ) { + LAPACKE_dge_trans( matrix_layout, rect_m, rect_n, + &in[rect_in_offset], ldin, + &out[rect_out_offset], ldout ); + } + + /* Copy & transpose triangular part */ + return LAPACKE_dtr_trans( matrix_layout, uplo, diag, tri_n, + &in[tri_in_offset], ldin, + &out[tri_out_offset], ldout ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_stz_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_stz_nancheck.c new file mode 100644 index 000000000..7d7c30f96 --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_stz_nancheck.c @@ -0,0 +1,143 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Check a trapezoidal matrix for NaN entries. The shape of the trapezoidal + matrix is determined by the arguments `direct` and `uplo`. `Direct` chooses + the diagonal which shall be considered and `uplo` tells us whether we use the + upper or lower part of the matrix with respect to the chosen diagonal. 
+ + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +lapack_logical LAPACKE_stz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const float *a, lapack_int lda ) +{ + lapack_logical colmaj, front, lower, unit; + + if( a == NULL ) return (lapack_logical) 0; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return (lapack_logical) 0; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_offset = tri_n * ( !colmaj ? lda : 1 ); + } else if( !lower && n > m ) { + rect_offset = tri_n * ( colmaj ? lda : 1 ); + } + } else { + if( m > n ) { + tri_offset = rect_m * ( !colmaj ? lda : 1 ); + if( !lower ) { + rect_offset = 0; + } + } else if( n > m ) { + tri_offset = rect_n * ( colmaj ? lda : 1 ); + if( lower ) { + rect_offset = 0; + } + } + } + + /* Check rectangular part */ + if( rect_offset >= 0 ) { + if( LAPACKE_sge_nancheck( matrix_layout, rect_m, rect_n, + &a[rect_offset], lda) ) { + return (lapack_logical) 1; + } + } + + /* Check triangular part */ + return LAPACKE_str_nancheck( matrix_layout, uplo, diag, tri_n, + &a[tri_offset], lda ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_stz_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_stz_trans.c new file mode 100644 index 000000000..793f3833d --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_stz_trans.c @@ -0,0 +1,153 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Converts input triangular matrix from row-major(C) to column-major(Fortran) + layout or vice versa. The shape of the trapezoidal matrix is determined by + the arguments `direct` and `uplo`. `Direct` chooses the diagonal which shall + be considered and `uplo` tells us whether we use the upper or lower part of + the matrix with respect to the chosen diagonal. + + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +void LAPACKE_stz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const float *in, lapack_int ldin, + float *out, lapack_int ldout ) +{ + lapack_logical colmaj, front, lower, unit; + + if( in == NULL || out == NULL ) return ; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_in_offset = 0; + lapack_int tri_out_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_in_offset = -1; + lapack_int rect_out_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_in_offset = tri_n * ( !colmaj ? ldin : 1 ); + rect_out_offset = tri_n * ( colmaj ? ldout : 1 ); + } else if( !lower && n > m ) { + rect_in_offset = tri_n * ( colmaj ? 
ldin : 1 ); + rect_out_offset = tri_n * ( !colmaj ? ldout : 1 ); + } + } else { + if( m > n ) { + tri_in_offset = rect_m * ( !colmaj ? ldin : 1 ); + tri_out_offset = rect_m * ( colmaj ? ldout : 1 ); + if( !lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } else if( n > m ) { + tri_in_offset = rect_n * ( colmaj ? ldin : 1 ); + tri_out_offset = rect_n * ( !colmaj ? ldout : 1 ); + if( lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } + } + + /* Copy & transpose rectangular part */ + if( rect_in_offset >= 0 && rect_out_offset >= 0 ) { + LAPACKE_sge_trans( matrix_layout, rect_m, rect_n, + &in[rect_in_offset], ldin, + &out[rect_out_offset], ldout ); + } + + /* Copy & transpose triangular part */ + return LAPACKE_str_trans( matrix_layout, uplo, diag, tri_n, + &in[tri_in_offset], ldin, + &out[tri_out_offset], ldout ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ztz_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_ztz_nancheck.c new file mode 100644 index 000000000..481fa4c03 --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_ztz_nancheck.c @@ -0,0 +1,144 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Check a trapezoidal matrix for NaN entries. The shape of the trapezoidal + matrix is determined by the arguments `direct` and `uplo`. `Direct` chooses + the diagonal which shall be considered and `uplo` tells us whether we use the + upper or lower part of the matrix with respect to the chosen diagonal. 
+ + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +lapack_logical LAPACKE_ztz_nancheck( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_double *a, + lapack_int lda ) +{ + lapack_logical colmaj, front, lower, unit; + + if( a == NULL ) return (lapack_logical) 0; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return (lapack_logical) 0; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_offset = tri_n * ( !colmaj ? lda : 1 ); + } else if( !lower && n > m ) { + rect_offset = tri_n * ( colmaj ? lda : 1 ); + } + } else { + if( m > n ) { + tri_offset = rect_m * ( !colmaj ? lda : 1 ); + if( !lower ) { + rect_offset = 0; + } + } else if( n > m ) { + tri_offset = rect_n * ( colmaj ? lda : 1 ); + if( lower ) { + rect_offset = 0; + } + } + } + + /* Check rectangular part */ + if( rect_offset >= 0 ) { + if( LAPACKE_zge_nancheck( matrix_layout, rect_m, rect_n, + &a[rect_offset], lda) ) { + return (lapack_logical) 1; + } + } + + /* Check triangular part */ + return LAPACKE_ztr_nancheck( matrix_layout, uplo, diag, tri_n, + &a[tri_offset], lda ); +} diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ztz_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_ztz_trans.c new file mode 100644 index 000000000..881052331 --- /dev/null +++ b/lapack-netlib/LAPACKE/utils/lapacke_ztz_trans.c @@ -0,0 +1,153 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +****************************************************************************** +* Contents: Native C interface to LAPACK utility function +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +/***************************************************************************** + Converts input triangular matrix from row-major(C) to column-major(Fortran) + layout or vice versa. The shape of the trapezoidal matrix is determined by + the arguments `direct` and `uplo`. `Direct` chooses the diagonal which shall + be considered and `uplo` tells us whether we use the upper or lower part of + the matrix with respect to the chosen diagonal. + + Diagonals 'F' (front / forward) and 'B' (back / backward): + + A = ( F ) A = ( F B ) + ( F ) ( F B ) + ( B F ) ( F B ) + ( B ) + ( B ) + + direct = 'F', uplo = 'L': + + A = ( * ) A = ( * ) + ( * * ) ( * * ) + ( * * * ) ( * * * ) + ( * * * ) + ( * * * ) + + direct = 'F', uplo = 'U': + + A = ( * * * ) A = ( * * * * * ) + ( * * ) ( * * * * ) + ( * ) ( * * * ) + ( ) + ( ) + + direct = 'B', uplo = 'L': + + A = ( ) A = ( * * * ) + ( ) ( * * * * ) + ( * ) ( * * * * * ) + ( * * ) + ( * * * ) + + direct = 'B', uplo = 'U': + + A = ( * * * ) A = ( * * * ) + ( * * * ) ( * * ) + ( * * * ) ( * ) + ( * * ) + ( * ) + +*****************************************************************************/ + +void LAPACKE_ztz_trans( int matrix_layout, char direct, char uplo, + char diag, lapack_int m, lapack_int n, + const lapack_complex_double *in, lapack_int ldin, + lapack_complex_double *out, lapack_int ldout ) +{ + lapack_logical colmaj, front, lower, unit; + + if( in == NULL || out == NULL ) return ; + + colmaj = ( matrix_layout == LAPACK_COL_MAJOR ); + front = LAPACKE_lsame( direct, 'f' ); + lower = LAPACKE_lsame( uplo, 'l' ); + unit = LAPACKE_lsame( diag, 'u' ); + + if( ( !colmaj && ( matrix_layout != LAPACK_ROW_MAJOR ) ) || + ( !front && !LAPACKE_lsame( direct, 'b' ) ) || + ( !lower && !LAPACKE_lsame( uplo, 'u' ) ) || + ( !unit && !LAPACKE_lsame( diag, 'n' ) ) ) { + /* Just exit if any of input parameters are wrong */ + return; + } + + /* Initial offsets and sizes of triangular and rectangular parts */ + lapack_int tri_in_offset = 0; + lapack_int tri_out_offset = 0; + lapack_int tri_n = MIN(m,n); + lapack_int rect_in_offset = -1; + lapack_int rect_out_offset = -1; + lapack_int rect_m = ( m > n ) ? m - n : m; + lapack_int rect_n = ( n > m ) ? n - m : n; + + /* Fix offsets depending on the shape of the matrix */ + if( front ) { + if( lower && m > n ) { + rect_in_offset = tri_n * ( !colmaj ? ldin : 1 ); + rect_out_offset = tri_n * ( colmaj ? ldout : 1 ); + } else if( !lower && n > m ) { + rect_in_offset = tri_n * ( colmaj ? 
ldin : 1 ); + rect_out_offset = tri_n * ( !colmaj ? ldout : 1 ); + } + } else { + if( m > n ) { + tri_in_offset = rect_m * ( !colmaj ? ldin : 1 ); + tri_out_offset = rect_m * ( colmaj ? ldout : 1 ); + if( !lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } else if( n > m ) { + tri_in_offset = rect_n * ( colmaj ? ldin : 1 ); + tri_out_offset = rect_n * ( !colmaj ? ldout : 1 ); + if( lower ) { + rect_in_offset = 0; + rect_out_offset = 0; + } + } + } + + /* Copy & transpose rectangular part */ + if( rect_in_offset >= 0 && rect_out_offset >= 0 ) { + LAPACKE_zge_trans( matrix_layout, rect_m, rect_n, + &in[rect_in_offset], ldin, + &out[rect_out_offset], ldout ); + } + + /* Copy & transpose triangular part */ + return LAPACKE_ztr_trans( matrix_layout, uplo, diag, tri_n, + &in[tri_in_offset], ldin, + &out[tri_out_offset], ldout ); +} From 90d7451df55231b6ec656f3baf7a4e288697e7dc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 15:10:00 +0100 Subject: [PATCH 075/154] Add NaN check functions for trapezoidal matrices (Reference-LAPACK PR738+742) --- cmake/lapacke.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index 340ea6d6c..c740eceb4 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -2481,6 +2481,8 @@ set(Utils_SRC lapacke_ctp_nancheck.c lapacke_dtr_trans.c lapacke_str_trans.c lapacke_ztp_trans.c lapacke_ctp_trans.c lapacke_lsame.c lapacke_xerbla.c lapacke_ztr_nancheck.c lapacke_ctr_nancheck.c lapacke_make_complex_double.c lapacke_z_nancheck.c lapacke_ztr_trans.c + lapacke_ctz_nancheck.c lapacke_ctz_trans.c lapacke_dtz_nancheck.c lapacke_dtz_trans.c + lapacke_stz_nancheck.c lapacke_stz_trans.c lapacke_ztz_nancheck.c lapacke_ztz_trans.c ) set(LAPACKE_REL_SRC "") From 645633e321e09fafe271844011810a0fabd4f154 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 17:48:02 +0100 Subject: [PATCH 076/154] Fix leading dimension check of eigen-/Schur vectors (Reference-LAPACK PR 665) --- lapack-netlib/LAPACKE/src/lapacke_cgeev_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_cgeevx_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_cgges_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_cggesx_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_dgeev_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_dgeevx_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_dgges_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_dggesx_work.c | 4 ++-- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgeev_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgeev_work.c index 081f5b129..af6a247ed 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgeev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgeev_work.c @@ -61,12 +61,12 @@ lapack_int LAPACKE_cgeev_work( int matrix_layout, char jobvl, char jobvr, LAPACKE_xerbla( "LAPACKE_cgeev_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -9; LAPACKE_xerbla( "LAPACKE_cgeev_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -11; LAPACKE_xerbla( "LAPACKE_cgeev_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgeevx_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgeevx_work.c index 2257c64df..632ddd661 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgeevx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgeevx_work.c @@ -65,12 +65,12 @@ lapack_int LAPACKE_cgeevx_work( int 
matrix_layout, char balanc, char jobvl, LAPACKE_xerbla( "LAPACKE_cgeevx_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -11; LAPACKE_xerbla( "LAPACKE_cgeevx_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -13; LAPACKE_xerbla( "LAPACKE_cgeevx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgges_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgges_work.c index ff74939a3..be0b8347f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgges_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgges_work.c @@ -72,12 +72,12 @@ lapack_int LAPACKE_cgges_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_cgges_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -15; LAPACKE_xerbla( "LAPACKE_cgges_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -17; LAPACKE_xerbla( "LAPACKE_cgges_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_cggesx_work.c b/lapack-netlib/LAPACKE/src/lapacke_cggesx_work.c index 7edb1fa9b..311fe6e0a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cggesx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cggesx_work.c @@ -76,12 +76,12 @@ lapack_int LAPACKE_cggesx_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_cggesx_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -16; LAPACKE_xerbla( "LAPACKE_cggesx_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -18; LAPACKE_xerbla( "LAPACKE_cggesx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgeev_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgeev_work.c index c4de72394..424f5d176 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgeev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgeev_work.c @@ -59,12 +59,12 @@ lapack_int LAPACKE_dgeev_work( int matrix_layout, char jobvl, char jobvr, LAPACKE_xerbla( "LAPACKE_dgeev_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -10; LAPACKE_xerbla( "LAPACKE_dgeev_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -12; LAPACKE_xerbla( "LAPACKE_dgeev_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgeevx_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgeevx_work.c index 9efb49ed3..7f4c6881d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgeevx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgeevx_work.c @@ -63,12 +63,12 @@ lapack_int LAPACKE_dgeevx_work( int matrix_layout, char balanc, char jobvl, LAPACKE_xerbla( "LAPACKE_dgeevx_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -12; LAPACKE_xerbla( "LAPACKE_dgeevx_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -14; LAPACKE_xerbla( "LAPACKE_dgeevx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgges_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgges_work.c index effa1b3f5..bc6bf47d9 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgges_work.c +++ 
b/lapack-netlib/LAPACKE/src/lapacke_dgges_work.c @@ -70,12 +70,12 @@ lapack_int LAPACKE_dgges_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_dgges_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -16; LAPACKE_xerbla( "LAPACKE_dgges_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -18; LAPACKE_xerbla( "LAPACKE_dgges_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dggesx_work.c b/lapack-netlib/LAPACKE/src/lapacke_dggesx_work.c index ace40a32a..bde1321d7 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dggesx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dggesx_work.c @@ -73,12 +73,12 @@ lapack_int LAPACKE_dggesx_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_dggesx_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -17; LAPACKE_xerbla( "LAPACKE_dggesx_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -19; LAPACKE_xerbla( "LAPACKE_dggesx_work", info ); return info; From 2226a82f2e6e9d4891247941fe946b33415c5d0f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 17:50:49 +0100 Subject: [PATCH 077/154] Fix leading dimension check of eigen-/Schur vectors (Reference-LAPACK PR 665) --- lapack-netlib/LAPACKE/src/lapacke_sgeev_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_sgeevx_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_sgges_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_sggesx_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_zgeev_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_zgeevx_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_zgges_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_zggesx_work.c | 4 ++-- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgeev_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgeev_work.c index 0f5a8e004..af6dbedf0 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgeev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgeev_work.c @@ -59,12 +59,12 @@ lapack_int LAPACKE_sgeev_work( int matrix_layout, char jobvl, char jobvr, LAPACKE_xerbla( "LAPACKE_sgeev_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -10; LAPACKE_xerbla( "LAPACKE_sgeev_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -12; LAPACKE_xerbla( "LAPACKE_sgeev_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgeevx_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgeevx_work.c index d05ea16e9..67f4982bf 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgeevx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgeevx_work.c @@ -63,12 +63,12 @@ lapack_int LAPACKE_sgeevx_work( int matrix_layout, char balanc, char jobvl, LAPACKE_xerbla( "LAPACKE_sgeevx_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -12; LAPACKE_xerbla( "LAPACKE_sgeevx_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -14; LAPACKE_xerbla( "LAPACKE_sgeevx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgges_work.c 
b/lapack-netlib/LAPACKE/src/lapacke_sgges_work.c index a3b09de30..1bd3eacf4 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgges_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgges_work.c @@ -70,12 +70,12 @@ lapack_int LAPACKE_sgges_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_sgges_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -16; LAPACKE_xerbla( "LAPACKE_sgges_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -18; LAPACKE_xerbla( "LAPACKE_sgges_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sggesx_work.c b/lapack-netlib/LAPACKE/src/lapacke_sggesx_work.c index d3927e525..b1fbe1902 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sggesx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sggesx_work.c @@ -73,12 +73,12 @@ lapack_int LAPACKE_sggesx_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_sggesx_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -17; LAPACKE_xerbla( "LAPACKE_sggesx_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -19; LAPACKE_xerbla( "LAPACKE_sggesx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgeev_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgeev_work.c index 9393f825a..445b9dc1c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgeev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgeev_work.c @@ -61,12 +61,12 @@ lapack_int LAPACKE_zgeev_work( int matrix_layout, char jobvl, char jobvr, LAPACKE_xerbla( "LAPACKE_zgeev_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -9; LAPACKE_xerbla( "LAPACKE_zgeev_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -11; LAPACKE_xerbla( "LAPACKE_zgeev_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgeevx_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgeevx_work.c index e34112c09..29dbf06f0 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgeevx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgeevx_work.c @@ -65,12 +65,12 @@ lapack_int LAPACKE_zgeevx_work( int matrix_layout, char balanc, char jobvl, LAPACKE_xerbla( "LAPACKE_zgeevx_work", info ); return info; } - if( ldvl < n ) { + if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) { info = -11; LAPACKE_xerbla( "LAPACKE_zgeevx_work", info ); return info; } - if( ldvr < n ) { + if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) { info = -13; LAPACKE_xerbla( "LAPACKE_zgeevx_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgges_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgges_work.c index 2694c6530..13e2455c6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgges_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgges_work.c @@ -72,12 +72,12 @@ lapack_int LAPACKE_zgges_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_zgges_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -15; LAPACKE_xerbla( "LAPACKE_zgges_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -17; LAPACKE_xerbla( "LAPACKE_zgges_work", 
info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zggesx_work.c b/lapack-netlib/LAPACKE/src/lapacke_zggesx_work.c index f9f1ccee1..fe99949b7 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zggesx_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zggesx_work.c @@ -76,12 +76,12 @@ lapack_int LAPACKE_zggesx_work( int matrix_layout, char jobvsl, char jobvsr, LAPACKE_xerbla( "LAPACKE_zggesx_work", info ); return info; } - if( ldvsl < n ) { + if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) { info = -16; LAPACKE_xerbla( "LAPACKE_zggesx_work", info ); return info; } - if( ldvsr < n ) { + if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) { info = -18; LAPACKE_xerbla( "LAPACKE_zggesx_work", info ); return info; From 0c2aa0bed7d51af06f2ef2ab24779722473bce9d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 20:29:08 +0100 Subject: [PATCH 078/154] Fix implicit conversions and unused variables (Reference-LAPACK PR 703) --- lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c | 1 - lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c | 1 - lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c | 1 - lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c | 1 - 4 files changed, 4 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c index 8406635e9..05ff8d57f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_cgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; float* rwork = NULL; float rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_cgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c index 4e1b87681..4a0d427b3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_dgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; double* rwork = NULL; double rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_dgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c index 0b6406dec..627d2406c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_sgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; float* rwork = NULL; float rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_sgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c index 528b94a47..1d318e571 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_zgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; double* rwork = NULL; double rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_zgesvdq", -1 ); return -1; From a485e4f5156ad08dad26cdc20960b379fbc6a919 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 20:30:06 +0100 Subject: [PATCH 079/154] Fix implicit conversions and unused 
variables (Reference-LAPACK PR 703) --- lapack-netlib/LAPACKE/example/example_DGELS_rowmajor.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/lapack-netlib/LAPACKE/example/example_DGELS_rowmajor.c b/lapack-netlib/LAPACKE/example/example_DGELS_rowmajor.c index 1c027f862..a174fcaf0 100644 --- a/lapack-netlib/LAPACKE/example/example_DGELS_rowmajor.c +++ b/lapack-netlib/LAPACKE/example/example_DGELS_rowmajor.c @@ -49,11 +49,9 @@ LAPACKE_dgels (row-major, high-level) Example Program Results - -- LAPACKE Example routine (version 3.7.0) -- + -- LAPACKE Example routine -- -- LAPACK is a software package provided by Univ. of Tennessee, -- -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- - December 2016 - */ /* Calling DGELS using row-major layout */ @@ -66,8 +64,8 @@ int main (int argc, const char * argv[]) { /* Locals */ - double A[5][3] = {1,1,1,2,3,4,3,5,2,4,2,5,5,4,3}; - double b[5][2] = {-10,-3,12,14,14,12,16,16,18,16}; + double A[5][3] = {{1,1,1},{2,3,4},{3,5,2},{4,2,5},{5,4,3}}; + double b[5][2] = {{-10,-3},{12,14},{14,12},{16,16},{18,16}}; lapack_int info,m,n,lda,ldb,nrhs; /* Initialization */ From c99d27ae451a8c3e1ad46f3ff2fc2661ccb94896 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 20:33:20 +0100 Subject: [PATCH 080/154] Fix implicit conversions and unused variables (Reference-LAPACK PR 703) --- lapack-netlib/TESTING/EIG/cdrvsg.f | 4 ++-- lapack-netlib/TESTING/EIG/cget37.f | 2 +- lapack-netlib/TESTING/EIG/ddrvsg.f | 4 ++-- lapack-netlib/TESTING/EIG/sdrvsg.f | 4 ++-- lapack-netlib/TESTING/EIG/zdrvsg.f | 4 ++-- lapack-netlib/TESTING/EIG/zget37.f | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/cdrvsg.f b/lapack-netlib/TESTING/EIG/cdrvsg.f index a93933a27..d15b39d01 100644 --- a/lapack-netlib/TESTING/EIG/cdrvsg.f +++ b/lapack-netlib/TESTING/EIG/cdrvsg.f @@ -663,8 +663,8 @@ IL = 1 IU = N ELSE - IL = 1 + ( N-1 )*SLARND( 1, ISEED2 ) - IU = 1 + ( N-1 )*SLARND( 1, ISEED2 ) + IL = 1 + INT( ( N-1 )*SLARND( 1, ISEED2 ) ) + IU = 1 + INT( ( N-1 )*SLARND( 1, ISEED2 ) ) IF( IL.GT.IU ) THEN ITEMP = IL IL = IU diff --git a/lapack-netlib/TESTING/EIG/cget37.f b/lapack-netlib/TESTING/EIG/cget37.f index c2a6589f3..44d4580d6 100644 --- a/lapack-netlib/TESTING/EIG/cget37.f +++ b/lapack-netlib/TESTING/EIG/cget37.f @@ -265,7 +265,7 @@ 100 CONTINUE WSRT( KMIN ) = WSRT( I ) WSRT( I ) = VMIN - VCMIN = WTMP( I ) + VCMIN = REAL( WTMP( I ) ) WTMP( I ) = W( KMIN ) WTMP( KMIN ) = VCMIN VMIN = STMP( KMIN ) diff --git a/lapack-netlib/TESTING/EIG/ddrvsg.f b/lapack-netlib/TESTING/EIG/ddrvsg.f index 0b49c8404..2e9d3c643 100644 --- a/lapack-netlib/TESTING/EIG/ddrvsg.f +++ b/lapack-netlib/TESTING/EIG/ddrvsg.f @@ -645,8 +645,8 @@ IL = 1 IU = N ELSE - IL = 1 + ( N-1 )*DLARND( 1, ISEED2 ) - IU = 1 + ( N-1 )*DLARND( 1, ISEED2 ) + IL = 1 + INT( ( N-1 )*DLARND( 1, ISEED2 ) ) + IU = 1 + INT( ( N-1 )*DLARND( 1, ISEED2 ) ) IF( IL.GT.IU ) THEN ITEMP = IL IL = IU diff --git a/lapack-netlib/TESTING/EIG/sdrvsg.f b/lapack-netlib/TESTING/EIG/sdrvsg.f index 4a57223c8..877579bcd 100644 --- a/lapack-netlib/TESTING/EIG/sdrvsg.f +++ b/lapack-netlib/TESTING/EIG/sdrvsg.f @@ -645,8 +645,8 @@ IL = 1 IU = N ELSE - IL = 1 + ( N-1 )*SLARND( 1, ISEED2 ) - IU = 1 + ( N-1 )*SLARND( 1, ISEED2 ) + IL = 1 + INT( ( N-1 )*SLARND( 1, ISEED2 ) ) + IU = 1 + INT( ( N-1 )*SLARND( 1, ISEED2 ) ) IF( IL.GT.IU ) THEN ITEMP = IL IL = IU diff --git a/lapack-netlib/TESTING/EIG/zdrvsg.f b/lapack-netlib/TESTING/EIG/zdrvsg.f index 336514a3f..71f1d6371 
100644 --- a/lapack-netlib/TESTING/EIG/zdrvsg.f +++ b/lapack-netlib/TESTING/EIG/zdrvsg.f @@ -663,8 +663,8 @@ IL = 1 IU = N ELSE - IL = 1 + ( N-1 )*DLARND( 1, ISEED2 ) - IU = 1 + ( N-1 )*DLARND( 1, ISEED2 ) + IL = 1 + INT( ( N-1 )*DLARND( 1, ISEED2 ) ) + IU = 1 + INT( ( N-1 )*DLARND( 1, ISEED2 ) ) IF( IL.GT.IU ) THEN ITEMP = IL IL = IU diff --git a/lapack-netlib/TESTING/EIG/zget37.f b/lapack-netlib/TESTING/EIG/zget37.f index 63680e855..5013fbdd9 100644 --- a/lapack-netlib/TESTING/EIG/zget37.f +++ b/lapack-netlib/TESTING/EIG/zget37.f @@ -265,7 +265,7 @@ 100 CONTINUE WSRT( KMIN ) = WSRT( I ) WSRT( I ) = VMIN - VCMIN = WTMP( I ) + VCMIN = DBLE( WTMP( I ) ) WTMP( I ) = W( KMIN ) WTMP( KMIN ) = VCMIN VMIN = STMP( KMIN ) From fdb012ceed9ec69d900f9c5117e7be4263b6a947 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 20:37:18 +0100 Subject: [PATCH 081/154] Fix implicit conversions and unused variables (Reference-LAPACK PR 703) --- lapack-netlib/TESTING/LIN/cchkpt.f | 6 +-- lapack-netlib/TESTING/LIN/cchktr.f | 54 ++++++++++++++++++++++----- lapack-netlib/TESTING/LIN/cdrvgt.f | 8 ++-- lapack-netlib/TESTING/LIN/clattp.f | 6 +-- lapack-netlib/TESTING/LIN/cpbt01.f | 5 ++- lapack-netlib/TESTING/LIN/cpot01.f | 4 +- lapack-netlib/TESTING/LIN/cppt01.f | 2 +- lapack-netlib/TESTING/LIN/cpst01.f | 2 +- lapack-netlib/TESTING/LIN/zchkpt.f | 6 +-- lapack-netlib/TESTING/LIN/zchktr.f | 60 +++++++++++++++++++++++------- lapack-netlib/TESTING/LIN/zdrvgt.f | 8 ++-- lapack-netlib/TESTING/LIN/zdrvpt.f | 12 +++--- lapack-netlib/TESTING/LIN/zlattp.f | 6 +-- lapack-netlib/TESTING/LIN/zpbt01.f | 5 ++- lapack-netlib/TESTING/LIN/zpot01.f | 4 +- lapack-netlib/TESTING/LIN/zppt01.f | 2 +- lapack-netlib/TESTING/LIN/zpst01.f | 2 +- 17 files changed, 131 insertions(+), 61 deletions(-) diff --git a/lapack-netlib/TESTING/LIN/cchkpt.f b/lapack-netlib/TESTING/LIN/cchkpt.f index 2ec802064..7dc367eeb 100644 --- a/lapack-netlib/TESTING/LIN/cchkpt.f +++ b/lapack-netlib/TESTING/LIN/cchkpt.f @@ -319,15 +319,15 @@ * elements. * IF( IZERO.EQ.1 ) THEN - D( 1 ) = Z( 2 ) + D( 1 ) = REAL( Z( 2 ) ) IF( N.GT.1 ) $ E( 1 ) = Z( 3 ) ELSE IF( IZERO.EQ.N ) THEN E( N-1 ) = Z( 1 ) - D( N ) = Z( 2 ) + D( N ) = REAL( Z( 2 ) ) ELSE E( IZERO-1 ) = Z( 1 ) - D( IZERO ) = Z( 2 ) + D( IZERO ) = REAL( Z( 2 ) ) E( IZERO ) = Z( 3 ) END IF END IF diff --git a/lapack-netlib/TESTING/LIN/cchktr.f b/lapack-netlib/TESTING/LIN/cchktr.f index ce1ecf761..c55b07643 100644 --- a/lapack-netlib/TESTING/LIN/cchktr.f +++ b/lapack-netlib/TESTING/LIN/cchktr.f @@ -31,7 +31,7 @@ *> *> \verbatim *> -*> CCHKTR tests CTRTRI, -TRS, -RFS, and -CON, and CLATRS +*> CCHKTR tests CTRTRI, -TRS, -RFS, and -CON, and CLATRS(3) *> \endverbatim * * Arguments: @@ -184,7 +184,7 @@ INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) REAL ONE, ZERO @@ -195,13 +195,13 @@ CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - REAL AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + REAL AINVNM, ANORM, BIGNUM, DUMMY, RCOND, RCONDC, + $ RCONDI, RCONDO, RES, SCALE, SLAMCH * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - REAL RESULT( NTESTS ) + REAL RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -210,9 +210,9 @@ * .. * .. External Subroutines .. 
EXTERNAL ALAERH, ALAHD, ALASUM, CCOPY, CERRTR, CGET04, - $ CLACPY, CLARHS, CLATRS, CLATTR, CTRCON, CTRRFS, - $ CTRT01, CTRT02, CTRT03, CTRT05, CTRT06, CTRTRI, - $ CTRTRS, XLAENV + $ CLACPY, CLARHS, CLATRS, CLATRS3, CLATTR, + $ CSSCAL, CTRCON, CTRRFS, CTRT01, CTRT02, CTRT03, + $ CTRT05, CTRT06, CTRTRI, CTRTRS, XLAENV, SLAMCH * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -236,6 +236,7 @@ * PATH( 1: 1 ) = 'Complex precision' PATH( 2: 3 ) = 'TR' + BIGNUM = SLAMCH('Overflow') / SLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -380,7 +381,7 @@ * This line is needed on a Sun SPARCstation. * IF( N.GT.0 ) - $ DUMMY = A( 1 ) + $ DUMMY = REAL( A( 1 ) ) * CALL CTRT02( UPLO, TRANS, DIAG, N, NRHS, A, LDA, $ X, LDA, B, LDA, WORK, RWORK, @@ -535,6 +536,32 @@ $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B. +* + SRNAMT = 'CLATRS3' + CALL CCOPY( N, X, 1, B, 1 ) + CALL CCOPY( N, X, 1, B, 1 ) + CALL CSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL CLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from CLATRS3. +* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'CLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'Y', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) + CALL CTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL CSSCAL( N, BIGNUM, X, 1 ) + CALL CTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -552,7 +579,14 @@ $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'CLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE diff --git a/lapack-netlib/TESTING/LIN/cdrvgt.f b/lapack-netlib/TESTING/LIN/cdrvgt.f index 8d43f640f..acfbbcfa1 100644 --- a/lapack-netlib/TESTING/LIN/cdrvgt.f +++ b/lapack-netlib/TESTING/LIN/cdrvgt.f @@ -307,16 +307,16 @@ IZERO = 0 ELSE IF( IMAT.EQ.8 ) THEN IZERO = 1 - Z( 2 ) = A( N ) + Z( 2 ) = REAL( A( N ) ) A( N ) = ZERO IF( N.GT.1 ) THEN - Z( 3 ) = A( 1 ) + Z( 3 ) = REAL( A( 1 ) ) A( 1 ) = ZERO END IF ELSE IF( IMAT.EQ.9 ) THEN IZERO = N - Z( 1 ) = A( 3*N-2 ) - Z( 2 ) = A( 2*N-1 ) + Z( 1 ) = REAL( A( 3*N-2 ) ) + Z( 2 ) = REAL( A( 2*N-1 ) ) A( 3*N-2 ) = ZERO A( 2*N-1 ) = ZERO ELSE diff --git a/lapack-netlib/TESTING/LIN/clattp.f b/lapack-netlib/TESTING/LIN/clattp.f index 82f0585df..a47a252ad 100644 --- a/lapack-netlib/TESTING/LIN/clattp.f +++ b/lapack-netlib/TESTING/LIN/clattp.f @@ -336,7 +336,7 @@ WORK( J+1 ) = PLUS2 WORK( N+J+1 ) = ZERO PLUS1 = STAR1 / PLUS2 - REXP = CLARND( 2, ISEED ) + REXP = REAL( CLARND( 2, ISEED ) ) IF( REXP.LT.ZERO ) THEN STAR1 = -SFAC**( ONE-REXP )*CLARND( 5, ISEED ) ELSE @@ -790,7 +790,7 @@ DO 460 J = 1, N / 2 JL = JJ DO 450 I = J, N - J - T = AP( JR-I+J ) + T = REAL( AP( JR-I+J ) ) AP( JR-I+J ) = AP( JL ) AP( JL ) = T JL = JL + I @@ -804,7 +804,7 @@ DO 480 J = 1, N / 2 JR = JJ DO 470 I = J, N - J - T = AP( JL+I-J ) + T = REAL( AP( JL+I-J ) ) AP( JL+I-J ) = AP( JR ) AP( JR ) = T JR = JR - I diff --git a/lapack-netlib/TESTING/LIN/cpbt01.f b/lapack-netlib/TESTING/LIN/cpbt01.f index 33c80666d..6145a1875 100644 --- a/lapack-netlib/TESTING/LIN/cpbt01.f +++ b/lapack-netlib/TESTING/LIN/cpbt01.f @@ -201,7 +201,8 @@ * * Compute the (K,K) element of the result. * - AKK = CDOTC( KLEN+1, AFAC( KC, K ), 1, AFAC( KC, K ), 1 ) + AKK = REAL( + $ CDOTC( KLEN+1, AFAC( KC, K ), 1, AFAC( KC, K ), 1 ) ) AFAC( KD+1, K ) = AKK * * Compute the rest of column K. @@ -228,7 +229,7 @@ * * Scale column K by the diagonal element. * - AKK = AFAC( 1, K ) + AKK = REAL( AFAC( 1, K ) ) CALL CSSCAL( KLEN+1, AKK, AFAC( 1, K ), 1 ) * 40 CONTINUE diff --git a/lapack-netlib/TESTING/LIN/cpot01.f b/lapack-netlib/TESTING/LIN/cpot01.f index 00e195dd6..fbcf65086 100644 --- a/lapack-netlib/TESTING/LIN/cpot01.f +++ b/lapack-netlib/TESTING/LIN/cpot01.f @@ -176,7 +176,7 @@ * * Compute the (K,K) element of the result. * - TR = CDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) + TR = REAL( CDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) ) AFAC( K, K ) = TR * * Compute the rest of column K. @@ -224,7 +224,7 @@ 70 CONTINUE END IF * -* Compute norm( L*U - A ) / ( N * norm(A) * EPS ) +* Compute norm(L*U - A) / ( N * norm(A) * EPS ) * RESID = CLANHE( '1', UPLO, N, AFAC, LDAFAC, RWORK ) * diff --git a/lapack-netlib/TESTING/LIN/cppt01.f b/lapack-netlib/TESTING/LIN/cppt01.f index 3a761a4c7..f865ec779 100644 --- a/lapack-netlib/TESTING/LIN/cppt01.f +++ b/lapack-netlib/TESTING/LIN/cppt01.f @@ -178,7 +178,7 @@ * * Compute the (K,K) element of the result. * - TR = CDOTC( K, AFAC( KC ), 1, AFAC( KC ), 1 ) + TR = REAL( CDOTC( K, AFAC( KC ), 1, AFAC( KC ), 1 ) ) AFAC( KC+K-1 ) = TR * * Compute the rest of column K. diff --git a/lapack-netlib/TESTING/LIN/cpst01.f b/lapack-netlib/TESTING/LIN/cpst01.f index 26da4b394..03d25515d 100644 --- a/lapack-netlib/TESTING/LIN/cpst01.f +++ b/lapack-netlib/TESTING/LIN/cpst01.f @@ -219,7 +219,7 @@ * * Compute the (K,K) element of the result. 
* - TR = CDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) + TR = REAL( CDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) ) AFAC( K, K ) = TR * * Compute the rest of column K. diff --git a/lapack-netlib/TESTING/LIN/zchkpt.f b/lapack-netlib/TESTING/LIN/zchkpt.f index 80e1690a7..11089d2a1 100644 --- a/lapack-netlib/TESTING/LIN/zchkpt.f +++ b/lapack-netlib/TESTING/LIN/zchkpt.f @@ -319,15 +319,15 @@ * elements. * IF( IZERO.EQ.1 ) THEN - D( 1 ) = Z( 2 ) + D( 1 ) = DBLE( Z( 2 ) ) IF( N.GT.1 ) $ E( 1 ) = Z( 3 ) ELSE IF( IZERO.EQ.N ) THEN E( N-1 ) = Z( 1 ) - D( N ) = Z( 2 ) + D( N ) = DBLE( Z( 2 ) ) ELSE E( IZERO-1 ) = Z( 1 ) - D( IZERO ) = Z( 2 ) + D( IZERO ) = DBLE( Z( 2 ) ) E( IZERO ) = Z( 3 ) END IF END IF diff --git a/lapack-netlib/TESTING/LIN/zchktr.f b/lapack-netlib/TESTING/LIN/zchktr.f index 0a6f47b1e..275ca2857 100644 --- a/lapack-netlib/TESTING/LIN/zchktr.f +++ b/lapack-netlib/TESTING/LIN/zchktr.f @@ -31,7 +31,7 @@ *> *> \verbatim *> -*> ZCHKTR tests ZTRTRI, -TRS, -RFS, and -CON, and ZLATRS +*> ZCHKTR tests ZTRTRI, -TRS, -RFS, and -CON, and ZLATRS(3) *> \endverbatim * * Arguments: @@ -184,7 +184,7 @@ INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) DOUBLE PRECISION ONE, ZERO @@ -195,13 +195,13 @@ CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - DOUBLE PRECISION AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + DOUBLE PRECISION AINVNM, ANORM, BIGNUM, DUMMY, RCOND, RCONDC, + $ RCONDI, RCONDO, RES, SCALE, DLAMCH * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - DOUBLE PRECISION RESULT( NTESTS ) + DOUBLE PRECISION RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -209,10 +209,10 @@ EXTERNAL LSAME, ZLANTR * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, XLAENV, ZCOPY, ZERRTR, - $ ZGET04, ZLACPY, ZLARHS, ZLATRS, ZLATTR, ZTRCON, - $ ZTRRFS, ZTRT01, ZTRT02, ZTRT03, ZTRT05, ZTRT06, - $ ZTRTRI, ZTRTRS + EXTERNAL ALAERH, ALAHD, ALASUM, DLAMCH, XLAENV, ZCOPY, + $ ZDSCAL, ZERRTR, ZGET04, ZLACPY, ZLARHS, ZLATRS, + $ ZLATRS3, ZLATTR, ZTRCON, ZTRRFS, ZTRT01, + $ ZTRT02, ZTRT03, ZTRT05, ZTRT06, ZTRTRI, ZTRTRS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -236,6 +236,7 @@ * PATH( 1: 1 ) = 'Zomplex precision' PATH( 2: 3 ) = 'TR' + BIGNUM = DLAMCH('Overflow') / DLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -380,7 +381,7 @@ * This line is needed on a Sun SPARCstation. * IF( N.GT.0 ) - $ DUMMY = A( 1 ) + $ DUMMY = DBLE( A( 1 ) ) * CALL ZTRT02( UPLO, TRANS, DIAG, N, NRHS, A, LDA, $ X, LDA, B, LDA, WORK, RWORK, @@ -535,6 +536,32 @@ $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B +* + SRNAMT = 'ZLATRS3' + CALL ZCOPY( N, X, 1, B, 1 ) + CALL ZCOPY( N, X, 1, B( N+1 ), 1 ) + CALL ZDSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL ZLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from ZLATRS3. 
+* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'ZLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'N', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) + CALL ZTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL ZDSCAL( N, BIGNUM, X, 1 ) + CALL ZTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RES ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -552,7 +579,14 @@ $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'ZLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE @@ -565,8 +599,8 @@ 9999 FORMAT( ' UPLO=''', A1, ''', DIAG=''', A1, ''', N=', I5, ', NB=', $ I4, ', type ', I2, ', test(', I2, ')= ', G12.5 ) 9998 FORMAT( ' UPLO=''', A1, ''', TRANS=''', A1, ''', DIAG=''', A1, - $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', - $ test(', I2, ')= ', G12.5 ) + $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', test(', + $ I2, ')= ', G12.5 ) 9997 FORMAT( ' NORM=''', A1, ''', UPLO =''', A1, ''', N=', I5, ',', $ 11X, ' type ', I2, ', test(', I2, ')=', G12.5 ) 9996 FORMAT( 1X, A, '( ''', A1, ''', ''', A1, ''', ''', A1, ''', ''', diff --git a/lapack-netlib/TESTING/LIN/zdrvgt.f b/lapack-netlib/TESTING/LIN/zdrvgt.f index d055e4bdb..b2e0f66b1 100644 --- a/lapack-netlib/TESTING/LIN/zdrvgt.f +++ b/lapack-netlib/TESTING/LIN/zdrvgt.f @@ -307,16 +307,16 @@ IZERO = 0 ELSE IF( IMAT.EQ.8 ) THEN IZERO = 1 - Z( 2 ) = A( N ) + Z( 2 ) = DBLE( A( N ) ) A( N ) = ZERO IF( N.GT.1 ) THEN - Z( 3 ) = A( 1 ) + Z( 3 ) = DBLE( A( 1 ) ) A( 1 ) = ZERO END IF ELSE IF( IMAT.EQ.9 ) THEN IZERO = N - Z( 1 ) = A( 3*N-2 ) - Z( 2 ) = A( 2*N-1 ) + Z( 1 ) = DBLE( A( 3*N-2 ) ) + Z( 2 ) = DBLE( A( 2*N-1 ) ) A( 3*N-2 ) = ZERO A( 2*N-1 ) = ZERO ELSE diff --git a/lapack-netlib/TESTING/LIN/zdrvpt.f b/lapack-netlib/TESTING/LIN/zdrvpt.f index 14a9f76ba..75f4d5738 100644 --- a/lapack-netlib/TESTING/LIN/zdrvpt.f +++ b/lapack-netlib/TESTING/LIN/zdrvpt.f @@ -266,12 +266,12 @@ * IA = 1 DO 20 I = 1, N - 1 - D( I ) = A( IA ) + D( I ) = DBLE( A( IA ) ) E( I ) = A( IA+1 ) IA = IA + 2 20 CONTINUE IF( N.GT.0 ) - $ D( N ) = A( IA ) + $ D( N ) = DBLE( A( IA ) ) ELSE * * Type 7-12: generate a diagonally dominant matrix with @@ -333,13 +333,13 @@ Z( 2 ) = D( 1 ) D( 1 ) = ZERO IF( N.GT.1 ) THEN - Z( 3 ) = E( 1 ) + Z( 3 ) = DBLE( E( 1 ) ) E( 1 ) = ZERO END IF ELSE IF( IMAT.EQ.9 ) THEN IZERO = N IF( N.GT.1 ) THEN - Z( 1 ) = E( N-1 ) + Z( 1 ) = DBLE( E( N-1 ) ) E( N-1 ) = ZERO END IF Z( 2 ) = D( N ) @@ -347,9 +347,9 @@ ELSE IF( IMAT.EQ.10 ) THEN IZERO = ( N+1 ) / 2 IF( IZERO.GT.1 ) THEN - Z( 1 ) = E( IZERO-1 ) + Z( 1 ) = DBLE( E( IZERO-1 ) ) E( IZERO-1 ) = ZERO - Z( 3 ) = E( IZERO ) + Z( 3 ) = DBLE( E( IZERO ) ) E( IZERO ) = ZERO END IF Z( 2 ) = D( IZERO ) diff --git a/lapack-netlib/TESTING/LIN/zlattp.f b/lapack-netlib/TESTING/LIN/zlattp.f index b728852b5..e05d9299e 100644 --- a/lapack-netlib/TESTING/LIN/zlattp.f +++ b/lapack-netlib/TESTING/LIN/zlattp.f @@ -336,7 +336,7 @@ WORK( J+1 ) = PLUS2 WORK( N+J+1 ) = ZERO PLUS1 = STAR1 / PLUS2 - REXP = ZLARND( 2, ISEED ) + REXP = DBLE( ZLARND( 2, ISEED ) ) IF( REXP.LT.ZERO ) THEN STAR1 = -SFAC**( ONE-REXP )*ZLARND( 5, ISEED ) ELSE @@ -790,7 +790,7 @@ DO 460 J = 1, N / 2 JL = JJ DO 450 I = J, 
N - J - T = AP( JR-I+J ) + T = DBLE( AP( JR-I+J ) ) AP( JR-I+J ) = AP( JL ) AP( JL ) = T JL = JL + I @@ -804,7 +804,7 @@ DO 480 J = 1, N / 2 JR = JJ DO 470 I = J, N - J - T = AP( JL+I-J ) + T = DBLE( AP( JL+I-J ) ) AP( JL+I-J ) = AP( JR ) AP( JR ) = T JR = JR - I diff --git a/lapack-netlib/TESTING/LIN/zpbt01.f b/lapack-netlib/TESTING/LIN/zpbt01.f index fb7881ac7..1801b66cf 100644 --- a/lapack-netlib/TESTING/LIN/zpbt01.f +++ b/lapack-netlib/TESTING/LIN/zpbt01.f @@ -201,7 +201,8 @@ * * Compute the (K,K) element of the result. * - AKK = ZDOTC( KLEN+1, AFAC( KC, K ), 1, AFAC( KC, K ), 1 ) + AKK = DBLE( + $ ZDOTC( KLEN+1, AFAC( KC, K ), 1, AFAC( KC, K ), 1 ) ) AFAC( KD+1, K ) = AKK * * Compute the rest of column K. @@ -228,7 +229,7 @@ * * Scale column K by the diagonal element. * - AKK = AFAC( 1, K ) + AKK = DBLE( AFAC( 1, K ) ) CALL ZDSCAL( KLEN+1, AKK, AFAC( 1, K ), 1 ) * 40 CONTINUE diff --git a/lapack-netlib/TESTING/LIN/zpot01.f b/lapack-netlib/TESTING/LIN/zpot01.f index d71445cd4..de83414c6 100644 --- a/lapack-netlib/TESTING/LIN/zpot01.f +++ b/lapack-netlib/TESTING/LIN/zpot01.f @@ -176,7 +176,7 @@ * * Compute the (K,K) element of the result. * - TR = ZDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) + TR = DBLE( ZDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) ) AFAC( K, K ) = TR * * Compute the rest of column K. @@ -224,7 +224,7 @@ 70 CONTINUE END IF * -* Compute norm( L*U - A ) / ( N * norm(A) * EPS ) +* Compute norm(L*U - A) / ( N * norm(A) * EPS ) * RESID = ZLANHE( '1', UPLO, N, AFAC, LDAFAC, RWORK ) * diff --git a/lapack-netlib/TESTING/LIN/zppt01.f b/lapack-netlib/TESTING/LIN/zppt01.f index 78ec595af..acaea50d2 100644 --- a/lapack-netlib/TESTING/LIN/zppt01.f +++ b/lapack-netlib/TESTING/LIN/zppt01.f @@ -178,7 +178,7 @@ * * Compute the (K,K) element of the result. * - TR = ZDOTC( K, AFAC( KC ), 1, AFAC( KC ), 1 ) + TR = DBLE( ZDOTC( K, AFAC( KC ), 1, AFAC( KC ), 1 ) ) AFAC( KC+K-1 ) = TR * * Compute the rest of column K. diff --git a/lapack-netlib/TESTING/LIN/zpst01.f b/lapack-netlib/TESTING/LIN/zpst01.f index 691857219..bed18c514 100644 --- a/lapack-netlib/TESTING/LIN/zpst01.f +++ b/lapack-netlib/TESTING/LIN/zpst01.f @@ -219,7 +219,7 @@ * * Compute the (K,K) element of the result. * - TR = ZDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) + TR = DBLE( ZDOTC( K, AFAC( 1, K ), 1, AFAC( 1, K ), 1 ) ) AFAC( K, K ) = TR * * Compute the rest of column K. From 8b3f9715ec122dd40a8a2638b92757ac7d8ff7f5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 22:48:37 +0100 Subject: [PATCH 082/154] Add macros for 32/64bit integer printf --- lapack-netlib/LAPACKE/include/lapack.h | 100 +++++++++++++++++- .../LAPACKE/include/lapacke_config.h | 18 +++- 2 files changed, 112 insertions(+), 6 deletions(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index 14695fdc8..b5a276f5a 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -12,6 +12,7 @@ #include #include +#include /* It seems all current Fortran compilers put strlen at end. 
* Some historical compilers put strlen after the str argument @@ -80,11 +81,26 @@ extern "C" { /*----------------------------------------------------------------------------*/ #ifndef lapack_int -#define lapack_int int +#if defined(LAPACK_ILP64) +#define lapack_int int64_t +#else +#define lapack_int int32_t +#endif +#endif + +/* + * Integer format string + */ +#ifndef LAPACK_IFMT +#if defined(LAPACK_ILP64) +#define LAPACK_IFMT PRId64 +#else +#define LAPACK_IFMT PRId32 +#endif #endif #ifndef lapack_logical -#define lapack_logical lapack_int +#define lapack_logical lapack_int #endif /* f2c, hence clapack and MacOS Accelerate, returns double instead of float @@ -115,7 +131,7 @@ typedef lapack_logical (*LAPACK_Z_SELECT2) ( const lapack_complex_double*, const lapack_complex_double* ); #define LAPACK_lsame_base LAPACK_GLOBAL(lsame,LSAME) -lapack_logical LAPACK_lsame_base( const char* ca, const char* cb, +lapack_logical LAPACK_lsame_base( const char* ca, const char* cb, lapack_int lca, lapack_int lcb #ifdef LAPACK_FORTRAN_STRLEN_END , size_t, size_t @@ -21986,6 +22002,84 @@ void LAPACK_ztrsyl_base( #define LAPACK_ztrsyl(...) LAPACK_ztrsyl_base(__VA_ARGS__) #endif +#define LAPACK_ctrsyl3_base LAPACK_GLOBAL(ctrsyl3,CTRSYL3) +void LAPACK_ctrsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* C, lapack_int const* ldc, float* scale, + float* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_ctrsyl3(...) LAPACK_ctrsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_ctrsyl3(...) LAPACK_ctrsyl3_base(__VA_ARGS__) +#endif + +#define LAPACK_dtrsyl3_base LAPACK_GLOBAL(dtrsyl3,DTRSYL3) +void LAPACK_dtrsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + double const* A, lapack_int const* lda, + double const* B, lapack_int const* ldb, + double* C, lapack_int const* ldc, double* scale, + lapack_int* iwork, lapack_int const* liwork, + double* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_dtrsyl3(...) LAPACK_dtrsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_dtrsyl3(...) LAPACK_dtrsyl3_base(__VA_ARGS__) +#endif + +#define LAPACK_strsyl3_base LAPACK_GLOBAL(strsyl3,STRSYL3) +void LAPACK_strsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + float const* A, lapack_int const* lda, + float const* B, lapack_int const* ldb, + float* C, lapack_int const* ldc, float* scale, + lapack_int* iwork, lapack_int const* liwork, + float* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_strsyl3(...) LAPACK_strsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_strsyl3(...) 
LAPACK_strsyl3_base(__VA_ARGS__) +#endif + +#define LAPACK_ztrsyl3_base LAPACK_GLOBAL(ztrsyl3,ZTRSYL3) +void LAPACK_ztrsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* C, lapack_int const* ldc, double* scale, + double* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_ztrsyl3(...) LAPACK_ztrsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_ztrsyl3(...) LAPACK_ztrsyl3_base(__VA_ARGS__) +#endif + #define LAPACK_ctrtri_base LAPACK_GLOBAL(ctrtri,CTRTRI) void LAPACK_ctrtri_base( char const* uplo, char const* diag, diff --git a/lapack-netlib/LAPACKE/include/lapacke_config.h b/lapack-netlib/LAPACKE/include/lapacke_config.h index 4a7d15760..c64fc4416 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_config.h +++ b/lapack-netlib/LAPACKE/include/lapacke_config.h @@ -42,17 +42,29 @@ extern "C" { #include #include +#include #ifndef lapack_int #if defined(LAPACK_ILP64) -#define lapack_int int64_t +#define lapack_int int64_t #else -#define lapack_int int32_t +#define lapack_int int32_t +#endif +#endif + +/* + * Integer format string + */ +#ifndef LAPACK_IFMT +#if defined(LAPACK_ILP64) +#define LAPACK_IFMT PRId64 +#else +#define LAPACK_IFMT PRId32 #endif #endif #ifndef lapack_logical -#define lapack_logical lapack_int +#define lapack_logical lapack_int #endif #ifndef LAPACK_COMPLEX_CUSTOM From 29dc086f38eb0220f301b2f6d9a3dc85e9346dbf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 22:50:02 +0100 Subject: [PATCH 083/154] Add macros for 32/64bit integer printf --- lapack-netlib/LAPACKE/example/example_DGESV_colmajor.c | 6 ++---- lapack-netlib/LAPACKE/example/example_DGESV_rowmajor.c | 6 ++---- lapack-netlib/LAPACKE/example/lapacke_example_aux.c | 2 +- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/lapack-netlib/LAPACKE/example/example_DGESV_colmajor.c b/lapack-netlib/LAPACKE/example/example_DGESV_colmajor.c index c8bdd6e4e..44a470d47 100644 --- a/lapack-netlib/LAPACKE/example/example_DGESV_colmajor.c +++ b/lapack-netlib/LAPACKE/example/example_DGESV_colmajor.c @@ -25,11 +25,9 @@ LAPACKE_dgesv (col-major, high-level) Example Program Results - -- LAPACKE Example routine (version 3.7.0) -- + -- LAPACKE Example routine -- -- LAPACK is a software package provided by Univ. of Tennessee, -- -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- - December 2016 - */ /* Includes */ #include @@ -94,7 +92,7 @@ int main(int argc, char **argv) { /* Check for the exact singularity */ if( info > 0 ) { printf( "The diagonal element of the triangular factor of A,\n" ); - printf( "U(%i,%i) is zero, so that A is singular;\n", info, info ); + printf( "U(%" LAPACK_IFMT ",%" LAPACK_IFMT ") is zero, so that A is singular;\n", info, info ); printf( "the solution could not be computed.\n" ); exit( 1 ); } diff --git a/lapack-netlib/LAPACKE/example/example_DGESV_rowmajor.c b/lapack-netlib/LAPACKE/example/example_DGESV_rowmajor.c index 35bdcbcae..5411ef049 100644 --- a/lapack-netlib/LAPACKE/example/example_DGESV_rowmajor.c +++ b/lapack-netlib/LAPACKE/example/example_DGESV_rowmajor.c @@ -25,11 +25,9 @@ LAPACKE_dgesv (row-major, high-level) Example Program Results - -- LAPACKE Example routine (version 3.7.0) -- + -- LAPACKE Example routine -- -- LAPACK is a software package provided by Univ. of Tennessee, -- -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- - December 2016 - */ #include #include @@ -91,7 +89,7 @@ int main(int argc, char **argv) { /* Check for the exact singularity */ if( info > 0 ) { printf( "The diagonal element of the triangular factor of A,\n" ); - printf( "U(%i,%i) is zero, so that A is singular;\n", info, info ); + printf( "U(%" LAPACK_IFMT ",%" LAPACK_IFMT ") is zero, so that A is singular;\n", info, info ); printf( "the solution could not be computed.\n" ); exit( 1 ); } diff --git a/lapack-netlib/LAPACKE/example/lapacke_example_aux.c b/lapack-netlib/LAPACKE/example/lapacke_example_aux.c index 9b72eb620..19fff7905 100644 --- a/lapack-netlib/LAPACKE/example/lapacke_example_aux.c +++ b/lapack-netlib/LAPACKE/example/lapacke_example_aux.c @@ -28,6 +28,6 @@ void print_matrix_colmajor( char* desc, lapack_int m, lapack_int n, double* mat, void print_vector( char* desc, lapack_int n, lapack_int* vec ) { lapack_int j; printf( "\n %s\n", desc ); - for( j = 0; j < n; j++ ) printf( " %6i", vec[j] ); + for( j = 0; j < n; j++ ) printf( " %6" LAPACK_IFMT, vec[j] ); printf( "\n" ); } From 4bc918a791d0b32a3e56b0b072d3d8cb72873a57 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 23:03:31 +0100 Subject: [PATCH 084/154] Add a BLAS3-based triangular Sylvester equation solver (Reference-LAPACK PR 651) --- lapack-netlib/LAPACKE/include/lapack.h | 100 +++++++++++++++++++++++- lapack-netlib/LAPACKE/include/lapacke.h | 74 ++++++++++++++++++ 2 files changed, 171 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index 14695fdc8..b5a276f5a 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -12,6 +12,7 @@ #include #include +#include /* It seems all current Fortran compilers put strlen at end. 
* Some historical compilers put strlen after the str argument @@ -80,11 +81,26 @@ extern "C" { /*----------------------------------------------------------------------------*/ #ifndef lapack_int -#define lapack_int int +#if defined(LAPACK_ILP64) +#define lapack_int int64_t +#else +#define lapack_int int32_t +#endif +#endif + +/* + * Integer format string + */ +#ifndef LAPACK_IFMT +#if defined(LAPACK_ILP64) +#define LAPACK_IFMT PRId64 +#else +#define LAPACK_IFMT PRId32 +#endif #endif #ifndef lapack_logical -#define lapack_logical lapack_int +#define lapack_logical lapack_int #endif /* f2c, hence clapack and MacOS Accelerate, returns double instead of float @@ -115,7 +131,7 @@ typedef lapack_logical (*LAPACK_Z_SELECT2) ( const lapack_complex_double*, const lapack_complex_double* ); #define LAPACK_lsame_base LAPACK_GLOBAL(lsame,LSAME) -lapack_logical LAPACK_lsame_base( const char* ca, const char* cb, +lapack_logical LAPACK_lsame_base( const char* ca, const char* cb, lapack_int lca, lapack_int lcb #ifdef LAPACK_FORTRAN_STRLEN_END , size_t, size_t @@ -21986,6 +22002,84 @@ void LAPACK_ztrsyl_base( #define LAPACK_ztrsyl(...) LAPACK_ztrsyl_base(__VA_ARGS__) #endif +#define LAPACK_ctrsyl3_base LAPACK_GLOBAL(ctrsyl3,CTRSYL3) +void LAPACK_ctrsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + lapack_complex_float const* A, lapack_int const* lda, + lapack_complex_float const* B, lapack_int const* ldb, + lapack_complex_float* C, lapack_int const* ldc, float* scale, + float* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_ctrsyl3(...) LAPACK_ctrsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_ctrsyl3(...) LAPACK_ctrsyl3_base(__VA_ARGS__) +#endif + +#define LAPACK_dtrsyl3_base LAPACK_GLOBAL(dtrsyl3,DTRSYL3) +void LAPACK_dtrsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + double const* A, lapack_int const* lda, + double const* B, lapack_int const* ldb, + double* C, lapack_int const* ldc, double* scale, + lapack_int* iwork, lapack_int const* liwork, + double* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_dtrsyl3(...) LAPACK_dtrsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_dtrsyl3(...) LAPACK_dtrsyl3_base(__VA_ARGS__) +#endif + +#define LAPACK_strsyl3_base LAPACK_GLOBAL(strsyl3,STRSYL3) +void LAPACK_strsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + float const* A, lapack_int const* lda, + float const* B, lapack_int const* ldb, + float* C, lapack_int const* ldc, float* scale, + lapack_int* iwork, lapack_int const* liwork, + float* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_strsyl3(...) LAPACK_strsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_strsyl3(...) 
LAPACK_strsyl3_base(__VA_ARGS__) +#endif + +#define LAPACK_ztrsyl3_base LAPACK_GLOBAL(ztrsyl3,ZTRSYL3) +void LAPACK_ztrsyl3_base( + char const* trana, char const* tranb, + lapack_int const* isgn, lapack_int const* m, lapack_int const* n, + lapack_complex_double const* A, lapack_int const* lda, + lapack_complex_double const* B, lapack_int const* ldb, + lapack_complex_double* C, lapack_int const* ldc, double* scale, + double* swork, lapack_int const *ldswork, + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_ztrsyl3(...) LAPACK_ztrsyl3_base(__VA_ARGS__, 1, 1) +#else + #define LAPACK_ztrsyl3(...) LAPACK_ztrsyl3_base(__VA_ARGS__) +#endif + #define LAPACK_ctrtri_base LAPACK_GLOBAL(ctrtri,CTRTRI) void LAPACK_ctrtri_base( char const* uplo, char const* diag, diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h index f6fbfcc33..9998b1504 100644 --- a/lapack-netlib/LAPACKE/include/lapacke.h +++ b/lapack-netlib/LAPACKE/include/lapacke.h @@ -2313,6 +2313,19 @@ lapack_int LAPACKE_zlagge( int matrix_layout, lapack_int m, lapack_int n, float LAPACKE_slamch( char cmach ); double LAPACKE_dlamch( char cmach ); +float LAPACKE_slangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const float* ab, + lapack_int ldab ); +double LAPACKE_dlangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const double* ab, + lapack_int ldab ); +float LAPACKE_clangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_float* ab, lapack_int ldab ); +double LAPACKE_zlangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_double* ab, lapack_int ldab ); + float LAPACKE_slange( int matrix_layout, char norm, lapack_int m, lapack_int n, const float* a, lapack_int lda ); double LAPACKE_dlange( int matrix_layout, char norm, lapack_int m, @@ -4477,6 +4490,23 @@ lapack_int LAPACKE_ztrsyl( int matrix_layout, char trana, char tranb, lapack_complex_double* c, lapack_int ldc, double* scale ); +lapack_int LAPACKE_strsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const float* a, lapack_int lda, const float* b, + lapack_int ldb, float* c, lapack_int ldc, + float* scale ); +lapack_int LAPACKE_dtrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const double* a, lapack_int lda, const double* b, + lapack_int ldb, double* c, lapack_int ldc, + double* scale ); +lapack_int LAPACKE_ztrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* c, lapack_int ldc, + double* scale ); + lapack_int LAPACKE_strtri( int matrix_layout, char uplo, char diag, lapack_int n, float* a, lapack_int lda ); lapack_int LAPACKE_dtrtri( int matrix_layout, char uplo, char diag, lapack_int n, @@ -7576,6 +7606,21 @@ double LAPACKE_dlapy3_work( double x, double y, double z ); float LAPACKE_slamch_work( char cmach ); double LAPACKE_dlamch_work( char cmach ); +float LAPACKE_slangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const float* ab, + lapack_int ldab, float* work ); +double LAPACKE_dlangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const double* ab, + lapack_int 
ldab, double* work ); +float LAPACKE_clangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_float* ab, lapack_int ldab, + float* work ); +double LAPACKE_zlangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_double* ab, lapack_int ldab, + double* work ); + float LAPACKE_slange_work( int matrix_layout, char norm, lapack_int m, lapack_int n, const float* a, lapack_int lda, float* work ); @@ -10174,6 +10219,35 @@ lapack_int LAPACKE_ztrsyl_work( int matrix_layout, char trana, char tranb, lapack_complex_double* c, lapack_int ldc, double* scale ); +lapack_int LAPACKE_strsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const float* a, lapack_int lda, + const float* b, lapack_int ldb, + float* c, lapack_int ldc, float* scale, + lapack_int* iwork, lapack_int liwork, + float* swork, lapack_int ldswork ); +lapack_int LAPACKE_dtrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const double* a, lapack_int lda, + const double* b, lapack_int ldb, + double* c, lapack_int ldc, double* scale, + lapack_int* iwork, lapack_int liwork, + double* swork, lapack_int ldswork ); +lapack_int LAPACKE_ctrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* b, lapack_int ldb, + lapack_complex_float* c, lapack_int ldc, + float* scale, float* swork, + lapack_int ldswork ); +lapack_int LAPACKE_ztrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* c, lapack_int ldc, + double* scale, double* swork, + lapack_int ldswork ); + lapack_int LAPACKE_strtri_work( int matrix_layout, char uplo, char diag, lapack_int n, float* a, lapack_int lda ); lapack_int LAPACKE_dtrtri_work( int matrix_layout, char uplo, char diag, From 7eb265326836d4480aa8a29cd93a3edc9b5c3b95 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 23:07:10 +0100 Subject: [PATCH 085/154] Add a BLAS3-based triangular Sylvester equation solver (Reference-LAPACK PR 651) --- lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c | 1 - lapack-netlib/LAPACKE/src/lapacke_ctrsyl3.c | 56 ++++++++++++ .../LAPACKE/src/lapacke_ctrsyl3_work.c | 88 +++++++++++++++++++ lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c | 1 - lapack-netlib/LAPACKE/src/lapacke_dtrsyl3.c | 68 ++++++++++++++ .../LAPACKE/src/lapacke_dtrsyl3_work.c | 86 ++++++++++++++++++ lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c | 1 - lapack-netlib/LAPACKE/src/lapacke_strsyl3.c | 68 ++++++++++++++ .../LAPACKE/src/lapacke_strsyl3_work.c | 86 ++++++++++++++++++ lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c | 1 - lapack-netlib/LAPACKE/src/lapacke_ztrsyl3.c | 56 ++++++++++++ .../LAPACKE/src/lapacke_ztrsyl3_work.c | 88 +++++++++++++++++++ 12 files changed, 596 insertions(+), 4 deletions(-) create mode 100644 lapack-netlib/LAPACKE/src/lapacke_ctrsyl3.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_ctrsyl3_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dtrsyl3.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dtrsyl3_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_strsyl3.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_strsyl3_work.c create mode 100644 
lapack-netlib/LAPACKE/src/lapacke_ztrsyl3.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_ztrsyl3_work.c diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c index 8406635e9..05ff8d57f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_cgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; float* rwork = NULL; float rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_cgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3.c b/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3.c new file mode 100644 index 000000000..c931aac48 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3.c @@ -0,0 +1,56 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_ctrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* b, lapack_int ldb, + lapack_complex_float* c, lapack_int ldc, + float* scale ) +{ + lapack_int info = 0; + float swork_query[2]; + float* swork = NULL; + lapack_int ldswork = -1; + lapack_int swork_size = -1; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_ctrsyl3", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cge_nancheck( matrix_layout, m, m, a, lda ) ) { + return -7; + } + if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + return -9; + } + if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -11; + } + } +#endif + /* Query optimal working array sizes */ + info = LAPACKE_ctrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda, + b, ldb, c, ldc, scale, swork_query, ldswork ); + if( info != 0 ) { + goto exit_level_0; + } + ldswork = swork_query[0]; + swork_size = ldswork * swork_query[1]; + swork = (float*)LAPACKE_malloc( sizeof(float) * swork_size); + if( swork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_ctrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, + lda, b, ldb, c, ldc, scale, swork, ldswork ); + /* Release memory and exit */ + LAPACKE_free( swork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_ctrsyl3", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3_work.c b/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3_work.c new file mode 100644 index 000000000..09c08d92a --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_ctrsyl3_work.c @@ -0,0 +1,88 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_ctrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* b, lapack_int ldb, + lapack_complex_float* c, lapack_int ldc, + float* scale, float* swork, + lapack_int ldswork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_ctrsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, + scale, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int 
ldb_t = MAX(1,n); + lapack_int ldc_t = MAX(1,m); + lapack_complex_float* a_t = NULL; + lapack_complex_float* b_t = NULL; + lapack_complex_float* c_t = NULL; + /* Check leading dimension(s) */ + if( lda < m ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + return info; + } + if( ldb < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + return info; + } + if( ldc < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,m) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + b_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + c_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * ldc_t * MAX(1,n) ); + if( c_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + /* Transpose input matrices */ + LAPACKE_cge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); + LAPACKE_cge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); + LAPACKE_cge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); + /* Call LAPACK function and adjust info */ + LAPACK_ctrsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, + c_t, &ldc_t, scale, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); + /* Release memory and exit */ + LAPACKE_free( c_t ); +exit_level_2: + LAPACKE_free( b_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c index 4e1b87681..4a0d427b3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_dgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; double* rwork = NULL; double rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_dgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3.c b/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3.c new file mode 100644 index 000000000..c95a772de --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3.c @@ -0,0 +1,68 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_dtrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const double* a, lapack_int lda, const double* b, + lapack_int ldb, double* c, lapack_int ldc, + double* scale ) +{ + lapack_int info = 0; + double swork_query[2]; + double* swork = NULL; + lapack_int ldswork = -1; + lapack_int swork_size = -1; + lapack_int iwork_query; + lapack_int* iwork = NULL; + lapack_int liwork = -1; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dtrsyl3", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, m, a, lda ) ) { + 
return -7; + } + if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + return -9; + } + if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -11; + } + } +#endif + /* Query optimal working array sizes */ + info = LAPACKE_dtrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda, + b, ldb, c, ldc, scale, &iwork_query, liwork, + swork_query, ldswork ); + if( info != 0 ) { + goto exit_level_0; + } + ldswork = swork_query[0]; + swork_size = ldswork * swork_query[1]; + swork = (double*)LAPACKE_malloc( sizeof(double) * swork_size); + if( swork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + liwork = iwork_query; + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if ( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_dtrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, + lda, b, ldb, c, ldc, scale, iwork, liwork, + swork, ldswork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( swork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dtrsyl3", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3_work.c b/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3_work.c new file mode 100644 index 000000000..272c35b38 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dtrsyl3_work.c @@ -0,0 +1,86 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_dtrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const double* a, lapack_int lda, + const double* b, lapack_int ldb, double* c, + lapack_int ldc, double* scale, + lapack_int* iwork, lapack_int liwork, + double* swork, lapack_int ldswork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_dtrsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, + scale, iwork, &liwork, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldb_t = MAX(1,n); + lapack_int ldc_t = MAX(1,m); + double* a_t = NULL; + double* b_t = NULL; + double* c_t = NULL; + /* Check leading dimension(s) */ + if( lda < m ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + return info; + } + if( ldb < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + return info; + } + if( ldc < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,m) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + b_t = (double*)LAPACKE_malloc( sizeof(double) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + c_t = (double*)LAPACKE_malloc( sizeof(double) * ldc_t * MAX(1,n) ); + if( c_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + /* Transpose input matrices */ + LAPACKE_dge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); + LAPACKE_dge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); + LAPACKE_dge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); + /* Call LAPACK function and adjust info */ + LAPACK_dtrsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, + c_t, &ldc_t, scale, iwork, &liwork, swork, 
&ldswork, + &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); + /* Release memory and exit */ + LAPACKE_free( c_t ); +exit_level_2: + LAPACKE_free( b_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_dtrsyl3_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c index 0b6406dec..627d2406c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_sgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; float* rwork = NULL; float rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_sgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_strsyl3.c b/lapack-netlib/LAPACKE/src/lapacke_strsyl3.c new file mode 100644 index 000000000..1cfc626c2 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_strsyl3.c @@ -0,0 +1,68 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_strsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const float* a, lapack_int lda, const float* b, + lapack_int ldb, float* c, lapack_int ldc, + float* scale ) +{ + lapack_int info = 0; + float swork_query[2]; + float* swork = NULL; + lapack_int ldswork = -1; + lapack_int swork_size = -1; + lapack_int iwork_query; + lapack_int* iwork = NULL; + lapack_int liwork = -1; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_strsyl3", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, m, a, lda ) ) { + return -7; + } + if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + return -9; + } + if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -11; + } + } +#endif + /* Query optimal working array sizes */ + info = LAPACKE_strsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda, + b, ldb, c, ldc, scale, &iwork_query, liwork, + swork_query, ldswork ); + if( info != 0 ) { + goto exit_level_0; + } + ldswork = swork_query[0]; + swork_size = ldswork * swork_query[1]; + swork = (float*)LAPACKE_malloc( sizeof(float) * swork_size); + if( swork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + liwork = iwork_query; + iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); + if ( iwork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } + /* Call middle-level interface */ + info = LAPACKE_strsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, + lda, b, ldb, c, ldc, scale, iwork, liwork, + swork, ldswork ); + /* Release memory and exit */ + LAPACKE_free( iwork ); +exit_level_1: + LAPACKE_free( swork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_strsyl3", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_strsyl3_work.c b/lapack-netlib/LAPACKE/src/lapacke_strsyl3_work.c new file mode 100644 index 000000000..3c50e4a45 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_strsyl3_work.c @@ -0,0 +1,86 @@ +#include 
"lapacke_utils.h" + +lapack_int LAPACKE_strsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const float* a, lapack_int lda, + const float* b, lapack_int ldb, float* c, + lapack_int ldc, float* scale, + lapack_int* iwork, lapack_int liwork, + float* swork, lapack_int ldswork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_strsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, + scale, iwork, &liwork, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldb_t = MAX(1,n); + lapack_int ldc_t = MAX(1,m); + float* a_t = NULL; + float* b_t = NULL; + float* c_t = NULL; + /* Check leading dimension(s) */ + if( lda < m ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + return info; + } + if( ldb < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + return info; + } + if( ldc < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,m) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + b_t = (float*)LAPACKE_malloc( sizeof(float) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + c_t = (float*)LAPACKE_malloc( sizeof(float) * ldc_t * MAX(1,n) ); + if( c_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + /* Transpose input matrices */ + LAPACKE_sge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); + LAPACKE_sge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); + LAPACKE_sge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); + /* Call LAPACK function and adjust info */ + LAPACK_strsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, + c_t, &ldc_t, scale, iwork, &liwork, swork, &ldswork, + &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); + /* Release memory and exit */ + LAPACKE_free( c_t ); +exit_level_2: + LAPACKE_free( b_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_strsyl3_work", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c index 528b94a47..1d318e571 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c @@ -48,7 +48,6 @@ lapack_int LAPACKE_zgesvdq( int matrix_layout, char joba, char jobp, lapack_int lrwork = -1; double* rwork = NULL; double rwork_query; - lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_zgesvdq", -1 ); return -1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3.c b/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3.c new file mode 100644 index 000000000..dbc9bcf9f --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3.c @@ -0,0 +1,56 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_ztrsyl3( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_double* a, lapack_int lda, + const 
lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* c, lapack_int ldc, + double* scale ) +{ + lapack_int info = 0; + double swork_query[2]; + double* swork = NULL; + lapack_int ldswork = -1; + lapack_int swork_size = -1; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_ztrsyl3", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, m, a, lda ) ) { + return -7; + } + if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + return -9; + } + if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) { + return -11; + } + } +#endif + /* Query optimal working array sizes */ + info = LAPACKE_ztrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda, + b, ldb, c, ldc, scale, swork_query, ldswork ); + if( info != 0 ) { + goto exit_level_0; + } + ldswork = swork_query[0]; + swork_size = ldswork * swork_query[1]; + swork = (double*)LAPACKE_malloc( sizeof(double) * swork_size); + if( swork == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_ztrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, + lda, b, ldb, c, ldc, scale, swork, ldswork ); + /* Release memory and exit */ + LAPACKE_free( swork ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_ztrsyl3", info ); + } + return info; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3_work.c b/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3_work.c new file mode 100644 index 000000000..a7ebd5da6 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_ztrsyl3_work.c @@ -0,0 +1,88 @@ +#include "lapacke_utils.h" + +lapack_int LAPACKE_ztrsyl3_work( int matrix_layout, char trana, char tranb, + lapack_int isgn, lapack_int m, lapack_int n, + const lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* c, lapack_int ldc, + double* scale, double* swork, + lapack_int ldswork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_ztrsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc, + scale, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_int ldb_t = MAX(1,n); + lapack_int ldc_t = MAX(1,m); + lapack_complex_double* a_t = NULL; + lapack_complex_double* b_t = NULL; + lapack_complex_double* c_t = NULL; + /* Check leading dimension(s) */ + if( lda < m ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + return info; + } + if( ldb < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + return info; + } + if( ldc < n ) { + info = -12; + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + return info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,m) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + b_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * ldb_t * MAX(1,n) ); + if( b_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + c_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * ldc_t * MAX(1,n) ); + if( c_t == NULL ) { + info = 
LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_2; + } + /* Transpose input matrices */ + LAPACKE_zge_trans( matrix_layout, m, m, a, lda, a_t, lda_t ); + LAPACKE_zge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t ); + LAPACKE_zge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t ); + /* Call LAPACK function and adjust info */ + LAPACK_ztrsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t, + c_t, &ldc_t, scale, swork, &ldswork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc ); + /* Release memory and exit */ + LAPACKE_free( c_t ); +exit_level_2: + LAPACKE_free( b_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_ztrsyl3_work", info ); + } + return info; +} From 6eb707d94110f800b6b10b78b648fc3e41672d01 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 23:10:13 +0100 Subject: [PATCH 086/154] Add a BLAS3-based triangular Sylvester equation solver (Reference-LAPACK PR 651) --- lapack-netlib/SRC/clatrs3.f | 666 +++++++++++++++++++ lapack-netlib/SRC/ctrsyl3.f | 1142 ++++++++++++++++++++++++++++++++ lapack-netlib/SRC/dlarmm.f | 99 +++ lapack-netlib/SRC/dlatrs3.f | 656 ++++++++++++++++++ lapack-netlib/SRC/dtrsyl3.f | 1241 ++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/ilaenv.f | 15 + lapack-netlib/SRC/slarmm.f | 99 +++ lapack-netlib/SRC/slatrs3.f | 656 ++++++++++++++++++ lapack-netlib/SRC/strsyl3.f | 1244 +++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/zlatrs3.f | 667 +++++++++++++++++++ lapack-netlib/SRC/ztrsyl3.f | 1142 ++++++++++++++++++++++++++++++++ 11 files changed, 7627 insertions(+) create mode 100644 lapack-netlib/SRC/clatrs3.f create mode 100644 lapack-netlib/SRC/ctrsyl3.f create mode 100644 lapack-netlib/SRC/dlarmm.f create mode 100644 lapack-netlib/SRC/dlatrs3.f create mode 100644 lapack-netlib/SRC/dtrsyl3.f create mode 100644 lapack-netlib/SRC/slarmm.f create mode 100644 lapack-netlib/SRC/slatrs3.f create mode 100644 lapack-netlib/SRC/strsyl3.f create mode 100644 lapack-netlib/SRC/zlatrs3.f create mode 100644 lapack-netlib/SRC/ztrsyl3.f diff --git a/lapack-netlib/SRC/clatrs3.f b/lapack-netlib/SRC/clatrs3.f new file mode 100644 index 000000000..a902f1ed0 --- /dev/null +++ b/lapack-netlib/SRC/clatrs3.f @@ -0,0 +1,666 @@ +*> \brief \b CLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. +* +* Definition: +* =========== +* +* SUBROUTINE CLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, +* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER DIAG, NORMIN, TRANS, UPLO +* INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. +* REAL CNORM( * ), SCALE( * ), WORK( * ) +* COMPLEX A( LDA, * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLATRS3 solves one of the triangular systems +*> +*> A * X = B * diag(scale), A**T * X = B * diag(scale), or +*> A**H * X = B * diag(scale) +*> +*> with scaling to prevent overflow. Here A is an upper or lower +*> triangular matrix, A**T denotes the transpose of A, A**H denotes the +*> conjugate transpose of A. X and B are n-by-nrhs matrices and scale +*> is an nrhs-element vector of scaling factors. A scaling factor scale(j) +*> is usually less than or equal to 1, chosen such that X(:,j) is less +*> than the overflow threshold. 
If the matrix A is singular (A(j,j) = 0 +*> for some j), then a non-trivial solution to A*X = 0 is returned. If +*> the system is so badly scaled that the solution cannot be represented +*> as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. +*> +*> This is a BLAS-3 version of LATRS for solving several right +*> hand sides simultaneously. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the matrix A is upper or lower triangular. +*> = 'U': Upper triangular +*> = 'L': Lower triangular +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the operation applied to A. +*> = 'N': Solve A * x = s*b (No transpose) +*> = 'T': Solve A**T* x = s*b (Transpose) +*> = 'C': Solve A**T* x = s*b (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> Specifies whether or not the matrix A is unit triangular. +*> = 'N': Non-unit triangular +*> = 'U': Unit triangular +*> \endverbatim +*> +*> \param[in] NORMIN +*> \verbatim +*> NORMIN is CHARACTER*1 +*> Specifies whether CNORM has been set or not. +*> = 'Y': CNORM contains the column norms on entry +*> = 'N': CNORM is not set on entry. On exit, the norms will +*> be computed and stored in CNORM. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of columns of X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> The triangular matrix A. If UPLO = 'U', the leading n by n +*> upper triangular part of the array A contains the upper +*> triangular matrix, and the strictly lower triangular part of +*> A is not referenced. If UPLO = 'L', the leading n by n lower +*> triangular part of the array A contains the lower triangular +*> matrix, and the strictly upper triangular part of A is not +*> referenced. If DIAG = 'U', the diagonal elements of A are +*> also not referenced and are assumed to be 1. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max (1,N). +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is COMPLEX array, dimension (LDX,NRHS) +*> On entry, the right hand side B of the triangular system. +*> On exit, X is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max (1,N). +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL array, dimension (NRHS) +*> The scaling factor s(k) is for the triangular system +*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). +*> If SCALE = 0, the matrix A is singular or badly scaled. +*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) +*> that is an exact or approximate solution to A*x(:,k) = 0 +*> is returned. If the system so badly scaled that solution +*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 +*> is returned. +*> \endverbatim +*> +*> \param[in,out] CNORM +*> \verbatim +*> CNORM is REAL array, dimension (N) +*> +*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) +*> contains the norm of the off-diagonal part of the j-th column +*> of A. 
If TRANS = 'N', CNORM(j) must be greater than or equal +*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) +*> must be greater than or equal to the 1-norm. +*> +*> If NORMIN = 'N', CNORM is an output argument and CNORM(j) +*> returns the 1-norm of the offdiagonal part of the j-th column +*> of A. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (LWORK). +*> On exit, if INFO = 0, WORK(1) returns the optimal size of +*> WORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> LWORK is INTEGER +*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where +*> NBA = (N + NB - 1)/NB and NB is the optimal block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -k, the k-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +*> \par Further Details: +* ===================== +* \verbatim +* The algorithm follows the structure of a block triangular solve. +* The diagonal block is solved with a call to the robust the triangular +* solver LATRS for every right-hand side RHS = 1, ..., NRHS +* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), +* where op( A ) = A or op( A ) = A**T or op( A ) = A**H. +* The linear block updates operate on block columns of X, +* B( I, K ) - op(A( I, J )) * X( J, K ) +* and use GEMM. To avoid overflow in the linear block update, the worst case +* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed +* such that +* || s * B( I, RHS )||_oo +* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold +* +* Once all columns of a block column have been rescaled (BLAS-1), the linear +* update is executed with GEMM without overflow. +* +* To limit rescaling, local scale factors track the scaling of column segments. +* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA +* per right-hand side column RHS = 1, ..., NRHS. The global scale factor +* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) +* I = 1, ..., NBA. +* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) +* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The +* linear update of potentially inconsistently scaled vector segments +* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) +* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, +* if necessary, rescales the blocks prior to calling GEMM. +* +* \endverbatim +* ===================================================================== +* References: +* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). +* Parallel robust solution of triangular linear systems. Concurrency +* and Computation: Practice and Experience, 31(19), e5064. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE CLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, + $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. 
+ CHARACTER DIAG, TRANS, NORMIN, UPLO + INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( LDX, * ) + REAL CNORM( * ), SCALE( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + COMPLEX CZERO, CONE + PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) + INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN + PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) + PARAMETER ( NBMIN = 8, NBMAX = 64 ) +* .. +* .. Local Arrays .. + REAL W( NBMAX ), XNRM( NBRHS ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER + INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, + $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, + $ SCAMIN, SMLNUM, TMAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, CLANGE, SLARMM + EXTERNAL ILAENV, LSAME, SLAMCH, CLANGE, SLARMM +* .. +* .. External Subroutines .. + EXTERNAL CLATRS, CSSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOTRAN = LSAME( TRANS, 'N' ) + NOUNIT = LSAME( DIAG, 'N' ) + LQUERY = ( LWORK.EQ.-1 ) +* +* Partition A and X into blocks. +* + NB = MAX( NBMIN, ILAENV( 1, 'CLATRS', '', N, N, -1, -1 ) ) + NB = MIN( NBMAX, NB ) + NBA = MAX( 1, (N + NB - 1) / NB ) + NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) +* +* Compute the workspace +* +* The workspace comprises two parts. +* The first part stores the local scale factors. Each simultaneously +* computed right-hand side requires one local scale factor per block +* row. WORK( I + KK * LDS ) is the scale factor of the vector +* segment associated with the I-th block row and the KK-th vector +* in the block column. + LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) + LDS = NBA +* The second part stores upper bounds of the triangular A. There are +* a total of NBA x NBA blocks, of which only the upper triangular +* part or the lower triangular part is referenced. The upper bound of +* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). + LANRM = NBA * NBA + AWRK = LSCALE + WORK( 1 ) = LSCALE + LANRM +* +* Test the input parameters. +* + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -3 + ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. + $ LSAME( NORMIN, 'N' ) ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -10 + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + INFO = -14 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CLATRS3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Initialize scaling factors +* + DO KK = 1, NRHS + SCALE( KK ) = ONE + END DO +* +* Quick return if possible +* + IF( MIN( N, NRHS ).EQ.0 ) + $ RETURN +* +* Determine machine dependent constant to control overflow. 
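+* BIGNUM is the overflow threshold and SMLNUM the safe minimum;
+* SMLNUM serves as the smallest admissible local scale factor kept
+* in WORK, and BIGNUM bounds the rescaling of solution segments
+* further below.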
+* + BIGNUM = SLAMCH( 'Overflow' ) + SMLNUM = SLAMCH( 'Safe Minimum' ) +* +* Use unblocked code for small problems +* + IF( NRHS.LT.NRHSMIN ) THEN + CALL CLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1 ), + $ SCALE( 1 ), CNORM, INFO ) + DO K = 2, NRHS + CALL CLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Compute norms of blocks of A excluding diagonal blocks and find +* the block with the largest norm TMAX. +* + TMAX = ZERO + DO J = 1, NBA + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 + IF ( UPPER ) THEN + IFIRST = 1 + ILAST = J - 1 + ELSE + IFIRST = J + 1 + ILAST = NBA + END IF + DO I = IFIRST, ILAST + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Compute upper bound of A( I1:I2-1, J1:J2-1 ). +* + IF( NOTRAN ) THEN + ANRM = CLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + I+(J-1)*NBA ) = ANRM + ELSE + ANRM = CLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + J+(I-1)*NBA ) = ANRM + END IF + TMAX = MAX( TMAX, ANRM ) + END DO + END DO +* + IF( .NOT. TMAX.LE.SLAMCH('Overflow') ) THEN +* +* Some matrix entries have huge absolute value. At least one upper +* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point +* number, either due to overflow in LANGE or due to Inf in A. +* Fall back to LATRS. Set normin = 'N' for every right-hand side to +* force computation of TSCAL in LATRS to avoid the likely overflow +* in the computation of the column norms CNORM. +* + DO K = 1, NRHS + CALL CLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Every right-hand side requires workspace to store NBA local scale +* factors. To save workspace, X is computed successively in block columns +* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient +* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. + DO K = 1, NBX +* Loop over block columns (index = K) of X and, for column-wise scalings, +* over individual columns (index = KK). 
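+*     Each block column of X contains at most NBRHS right-hand sides.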
+* K1: column index of the first column in X( J, K ) +* K2: column index of the first column in X( J, K+1 ) +* so the K2 - K1 is the column count of the block X( J, K ) + K1 = (K-1)*NBRHS + 1 + K2 = MIN( K*NBRHS, NRHS ) + 1 +* +* Initialize local scaling factors of current block column X( J, K ) +* + DO KK = 1, K2-K1 + DO I = 1, NBA + WORK( I+KK*LDS ) = ONE + END DO + END DO +* + IF( NOTRAN ) THEN +* +* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = NBA + JLAST = 1 + JINC = -1 + ELSE + JFIRST = 1 + JLAST = NBA + JINC = 1 + END IF + ELSE +* +* Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* where op(A) = A**T or op(A) = A**H +* + IF( UPPER ) THEN + JFIRST = 1 + JLAST = NBA + JINC = 1 + ELSE + JFIRST = NBA + JLAST = 1 + JINC = -1 + END IF + END IF + + DO J = JFIRST, JLAST, JINC +* J1: row index of the first row in A( J, J ) +* J2: row index of the first row in A( J+1, J+1 ) +* so that J2 - J1 is the row count of the block A( J, J ) + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 +* +* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( KK.EQ.1 ) THEN + CALL CLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + ELSE + CALL CLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + END IF +* Find largest absolute value entry in the vector segment +* X( J1:J2-1, RHS ) as an upper bound for the worst case +* growth in the linear updates. + XNRM( KK ) = CLANGE( 'I', J2-J1, 1, X( J1, RHS ), + $ LDX, W ) +* + IF( SCALOC .EQ. ZERO ) THEN +* LATRS found that A is singular through A(j,j) = 0. +* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 +* and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is +* set by LATRS. + SCALE( RHS ) = ZERO + DO II = 1, J1-1 + X( II, KK ) = CZERO + END DO + DO II = J2, N + X( II, KK ) = CZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. ZERO ) THEN +* LATRS computed a valid scale factor, but combined with +* the current scaling the solution does not have a +* scale factor > 0. +* +* Set WORK( J+KK*LDS ) to smallest valid scale +* factor and increase SCALOC accordingly. + SCAL = WORK( J+KK*LDS ) / SMLNUM + SCALOC = SCALOC * SCAL + WORK( J+KK*LDS ) = SMLNUM +* If LATRS overestimated the growth, x may be +* rescaled to preserve a valid combined scale +* factor WORK( J, KK ) > 0. + RSCAL = ONE / SCALOC + IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN + XNRM( KK ) = XNRM( KK ) * RSCAL + CALL CSSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) + SCALOC = ONE + ELSE +* The system op(A) * x = b is badly scaled and its +* solution cannot be represented as (1/scale) * x. +* Set x to zero. This approach deviates from LATRS +* where a completely meaningless non-zero vector +* is returned that is not a solution to op(A) * x = b. + SCALE( RHS ) = ZERO + DO II = 1, N + X( II, KK ) = CZERO + END DO +* Discard the local scale factors. 
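+*                 A zero solution carries no scaling information, so
+*                 the factors are reset to ONE.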
+ DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + END IF + END IF + SCALOC = SCALOC * WORK( J+KK*LDS ) + WORK( J+KK*LDS ) = SCALOC + END DO +* +* Linear block updates +* + IF( NOTRAN ) THEN + IF( UPPER ) THEN + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + ELSE + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + END IF + ELSE + IF( UPPER ) THEN + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + ELSE + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + END IF + END IF +* + DO I = IFIRST, ILAST, IINC +* I1: row index of the first column in X( I, K ) +* I2: row index of the first column in X( I+1, K ) +* so the I2 - I1 is the row count of the block X( I, K ) + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Prepare the linear update to be executed with GEMM. +* For each column, compute a consistent scaling, a +* scaling factor to survive the linear update, and +* rescale the column segments, if necesssary. Then +* the linear update is safely executed. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 +* Compute consistent scaling + SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + BNRM = CLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) + BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) + XNRM( KK ) = XNRM( KK )*( SCAMIN / WORK( J+KK*LDS) ) + ANRM = WORK( AWRK + I+(J-1)*NBA ) + SCALOC = SLARMM( ANRM, XNRM( KK ), BNRM ) +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to X( I, KK ) and X( J, KK ). +* + SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL CSSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + WORK( I+KK*LDS ) = SCAMIN*SCALOC + END IF +* + SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL CSSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) + WORK( J+KK*LDS ) = SCAMIN*SCALOC + END IF + END DO +* + IF( NOTRAN ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) +* + CALL CGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + ELSE IF( LSAME( TRANS, 'T' ) ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) +* + CALL CGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + ELSE +* +* B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K ) +* + CALL CGEMM( 'C', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + END IF + END DO + END DO +* +* Reduce local scaling factors +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + DO I = 1, NBA + SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) + END DO + END DO +* +* Realize consistent scaling +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. ZERO ) THEN + DO I = 1, NBA + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 + SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) + IF( SCAL.NE.ONE ) + $ CALL CSSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + END DO + END IF + END DO + END DO + RETURN +* +* End of CLATRS3 +* + END diff --git a/lapack-netlib/SRC/ctrsyl3.f b/lapack-netlib/SRC/ctrsyl3.f new file mode 100644 index 000000000..586dc0207 --- /dev/null +++ b/lapack-netlib/SRC/ctrsyl3.f @@ -0,0 +1,1142 @@ +*> \brief \b CTRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> CTRSYL3 solves the complex Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**H, and A and B are both upper triangular. 
A is +*> M-by-M and B is N-by-N; the right hand side C and the solution X are +*> M-by-N; and scale is an output scale factor, set <= 1 to avoid +*> overflow in X. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'C': op(A) = A**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'C': op(B) = B**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,M) +*> The upper triangular matrix A. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is COMPLEX array, dimension (LDB,N) +*> The upper triangular matrix B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is COMPLEX array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is REAL array, dimension (MAX(2, ROWS), MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. +*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +*> \ingroup complexSYcomputational +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. 
Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE CTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, SWORK, LDSWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, LDSWORK, M, N + REAL SCALE +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ) + REAL SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + COMPLEX CONE + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB + REAL ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM + COMPLEX CSGN +* .. +* .. Local Arrays .. + REAL WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL CLANGE, SLAMCH, SLARMM + EXTERNAL CLANGE, ILAENV, LSAME, SLAMCH, SLARMM +* .. +* .. External Subroutines .. + EXTERNAL CSSCAL, CGEMM, CLASCL, CTRSYL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, AIMAG, EXPONENT, MAX, MIN, REAL +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX( 8, ILAENV( 1, 'CTRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LDSWORK.EQ.-1 ) + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT. LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT. LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CTRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + SCALE = ONE + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems or if insufficient +* workspace is provided +* + IF( MIN( NBA, NBB ).EQ.1 .OR. LDSWORK.LT.MAX( NBA, NBB ) ) THEN + CALL CTRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = SLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Set local scaling factors. +* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. 
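+*     BUF collects the powers of two that are factored out of the local
+*     scale factors SWORK( K, L ); it is multiplied back into SCALE
+*     before returning.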
+* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = K, NBA + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, M ) + 1 + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = CLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = CLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, N ) + 1 + DO L = K, NBB + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = CLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = CLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = REAL( ISGN ) + CSGN = CMPLX( SGN, ZERO ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL CTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = CLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. 
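+*                    EXPONENT( SCALOC ) is the binary exponent of
+*                    SCALOC, so the divisions below rescale by an
+*                    exact power of two without rounding error.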
+ BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL.NE.ONE ) THEN + DO JJ = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL.NE.ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. NOTRNB ) THEN +* +* Solve A**H *X + ISGN*X*B = scale*C. 
+* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**H*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL CTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = CLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL CGEMM( 'C', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A**H *X + ISGN*X*B**H = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**H*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL CTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. 
The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = CLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL CGEMM( 'C', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'C', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. 
.NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**H = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL CTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = CLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = CLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL CGEMM( 'N', 'C', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is REAL. Set SCALE to +* zero and give up. +* + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL CSSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF +* + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? 
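+*        By at most BIGNUM / SCAL, where SCAL is the largest absolute
+*        real or imaginary part over all entries of C, computed below.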
+* + SCAL = MAX( ABS( REAL( C( 1, 1 ) ) ), + $ ABS( AIMAG( C ( 1, 1 ) ) ) ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( REAL ( C( K, L ) ) ), + $ ABS( AIMAG ( C( K, L ) ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. +* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL CLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IINFO ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of CTRSYL3 +* + END diff --git a/lapack-netlib/SRC/dlarmm.f b/lapack-netlib/SRC/dlarmm.f new file mode 100644 index 000000000..c36042009 --- /dev/null +++ b/lapack-netlib/SRC/dlarmm.f @@ -0,0 +1,99 @@ +*> \brief \b DLARMM +* +* Definition: +* =========== +* +* DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM ) +* +* .. Scalar Arguments .. +* DOUBLE PRECISION ANORM, BNORM, CNORM +* .. +* +*> \par Purpose: +* ======= +*> +*> \verbatim +*> +*> DLARMM returns a factor s in (0, 1] such that the linear updates +*> +*> (s * C) - A * (s * B) and (s * C) - (s * A) * B +*> +*> cannot overflow, where A, B, and C are matrices of conforming +*> dimensions. +*> +*> This is an auxiliary routine so there is no argument checking. +*> \endverbatim +* +* Arguments: +* ========= +* +*> \param[in] ANORM +*> \verbatim +*> ANORM is DOUBLE PRECISION +*> The infinity norm of A. ANORM >= 0. +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] BNORM +*> \verbatim +*> BNORM is DOUBLE PRECISION +*> The infinity norm of B. BNORM >= 0. +*> \endverbatim +*> +*> \param[in] CNORM +*> \verbatim +*> CNORM is DOUBLE PRECISION +*> The infinity norm of C. CNORM >= 0. +*> \endverbatim +*> +*> +* ===================================================================== +*> References: +*> C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for +*> Robust Solution of Triangular Linear Systems. In: International +*> Conference on Parallel Processing and Applied Mathematics, pages +*> 68--78. Springer, 2017. +*> +*> \ingroup OTHERauxiliary +* ===================================================================== + + DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM ) + IMPLICIT NONE +* .. Scalar Arguments .. + DOUBLE PRECISION ANORM, BNORM, CNORM +* .. Parameters .. + DOUBLE PRECISION ONE, HALF, FOUR + PARAMETER ( ONE = 1.0D0, HALF = 0.5D+0, FOUR = 4.0D0 ) +* .. +* .. Local Scalars .. + DOUBLE PRECISION BIGNUM, SMLNUM +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH + EXTERNAL DLAMCH +* .. +* .. Executable Statements .. +* +* +* Determine machine dependent parameters to control overflow. +* + SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) + BIGNUM = ( ONE / SMLNUM ) / FOUR +* +* Compute a scale factor. +* + DLARMM = ONE + IF( BNORM .LE. ONE ) THEN + IF( ANORM * BNORM .GT. BIGNUM - CNORM ) THEN + DLARMM = HALF + END IF + ELSE + IF( ANORM .GT. (BIGNUM - CNORM) / BNORM ) THEN + DLARMM = HALF / BNORM + END IF + END IF + RETURN +* +* ==== End of DLARMM ==== +* + END diff --git a/lapack-netlib/SRC/dlatrs3.f b/lapack-netlib/SRC/dlatrs3.f new file mode 100644 index 000000000..b4a98bc78 --- /dev/null +++ b/lapack-netlib/SRC/dlatrs3.f @@ -0,0 +1,656 @@ +*> \brief \b DLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. 
+* +* Definition: +* =========== +* +* SUBROUTINE DLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, +* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER DIAG, NORMIN, TRANS, UPLO +* INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), CNORM( * ), SCALE( * ), +* WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLATRS3 solves one of the triangular systems +*> +*> A * X = B * diag(scale) or A**T * X = B * diag(scale) +*> +*> with scaling to prevent overflow. Here A is an upper or lower +*> triangular matrix, A**T denotes the transpose of A. X and B are +*> n by nrhs matrices and scale is an nrhs element vector of scaling +*> factors. A scaling factor scale(j) is usually less than or equal +*> to 1, chosen such that X(:,j) is less than the overflow threshold. +*> If the matrix A is singular (A(j,j) = 0 for some j), then +*> a non-trivial solution to A*X = 0 is returned. If the system is +*> so badly scaled that the solution cannot be represented as +*> (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. +*> +*> This is a BLAS-3 version of LATRS for solving several right +*> hand sides simultaneously. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the matrix A is upper or lower triangular. +*> = 'U': Upper triangular +*> = 'L': Lower triangular +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the operation applied to A. +*> = 'N': Solve A * x = s*b (No transpose) +*> = 'T': Solve A**T* x = s*b (Transpose) +*> = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> Specifies whether or not the matrix A is unit triangular. +*> = 'N': Non-unit triangular +*> = 'U': Unit triangular +*> \endverbatim +*> +*> \param[in] NORMIN +*> \verbatim +*> NORMIN is CHARACTER*1 +*> Specifies whether CNORM has been set or not. +*> = 'Y': CNORM contains the column norms on entry +*> = 'N': CNORM is not set on entry. On exit, the norms will +*> be computed and stored in CNORM. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of columns of X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> The triangular matrix A. If UPLO = 'U', the leading n by n +*> upper triangular part of the array A contains the upper +*> triangular matrix, and the strictly lower triangular part of +*> A is not referenced. If UPLO = 'L', the leading n by n lower +*> triangular part of the array A contains the lower triangular +*> matrix, and the strictly upper triangular part of A is not +*> referenced. If DIAG = 'U', the diagonal elements of A are +*> also not referenced and are assumed to be 1. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max (1,N). +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is DOUBLE PRECISION array, dimension (LDX,NRHS) +*> On entry, the right hand side B of the triangular system. +*> On exit, X is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. 
LDX >= max (1,N). +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION array, dimension (NRHS) +*> The scaling factor s(k) is for the triangular system +*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). +*> If SCALE = 0, the matrix A is singular or badly scaled. +*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) +*> that is an exact or approximate solution to A*x(:,k) = 0 +*> is returned. If the system so badly scaled that solution +*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 +*> is returned. +*> \endverbatim +*> +*> \param[in,out] CNORM +*> \verbatim +*> CNORM is DOUBLE PRECISION array, dimension (N) +*> +*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) +*> contains the norm of the off-diagonal part of the j-th column +*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal +*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) +*> must be greater than or equal to the 1-norm. +*> +*> If NORMIN = 'N', CNORM is an output argument and CNORM(j) +*> returns the 1-norm of the offdiagonal part of the j-th column +*> of A. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (LWORK). +*> On exit, if INFO = 0, WORK(1) returns the optimal size of +*> WORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> LWORK is INTEGER +*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where +*> NBA = (N + NB - 1)/NB and NB is the optimal block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -k, the k-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +*> \par Further Details: +* ===================== +* \verbatim +* The algorithm follows the structure of a block triangular solve. +* The diagonal block is solved with a call to the robust the triangular +* solver LATRS for every right-hand side RHS = 1, ..., NRHS +* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), +* where op( A ) = A or op( A ) = A**T. +* The linear block updates operate on block columns of X, +* B( I, K ) - op(A( I, J )) * X( J, K ) +* and use GEMM. To avoid overflow in the linear block update, the worst case +* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed +* such that +* || s * B( I, RHS )||_oo +* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold +* +* Once all columns of a block column have been rescaled (BLAS-1), the linear +* update is executed with GEMM without overflow. +* +* To limit rescaling, local scale factors track the scaling of column segments. +* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA +* per right-hand side column RHS = 1, ..., NRHS. The global scale factor +* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) +* I = 1, ..., NBA. +* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) +* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. 
The +* linear update of potentially inconsistently scaled vector segments +* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) +* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, +* if necessary, rescales the blocks prior to calling GEMM. +* +* \endverbatim +* ===================================================================== +* References: +* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). +* Parallel robust solution of triangular linear systems. Concurrency +* and Computation: Practice and Experience, 31(19), e5064. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE DLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, + $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER DIAG, TRANS, NORMIN, UPLO + INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), CNORM( * ), X( LDX, * ), + $ SCALE( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN + PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) + PARAMETER ( NBMIN = 8, NBMAX = 64 ) +* .. +* .. Local Arrays .. + DOUBLE PRECISION W( NBMAX ), XNRM( NBRHS ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER + INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, + $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, + $ SCAMIN, SMLNUM, TMAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, DLANGE, DLARMM + EXTERNAL DLAMCH, DLANGE, DLARMM, ILAENV, LSAME +* .. +* .. External Subroutines .. + EXTERNAL DLATRS, DSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOTRAN = LSAME( TRANS, 'N' ) + NOUNIT = LSAME( DIAG, 'N' ) + LQUERY = ( LWORK.EQ.-1 ) +* +* Partition A and X into blocks +* + NB = MAX( 8, ILAENV( 1, 'DLATRS', '', N, N, -1, -1 ) ) + NB = MIN( NBMAX, NB ) + NBA = MAX( 1, (N + NB - 1) / NB ) + NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) +* +* Compute the workspace +* +* The workspace comprises two parts. +* The first part stores the local scale factors. Each simultaneously +* computed right-hand side requires one local scale factor per block +* row. WORK( I+KK*LDS ) is the scale factor of the vector +* segment associated with the I-th block row and the KK-th vector +* in the block column. + LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) + LDS = NBA +* The second part stores upper bounds of the triangular A. There are +* a total of NBA x NBA blocks, of which only the upper triangular +* part or the lower triangular part is referenced. The upper bound of +* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). + LANRM = NBA * NBA + AWRK = LSCALE + WORK( 1 ) = LSCALE + LANRM +* +* Test the input parameters +* + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -3 + ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. 
+ $ LSAME( NORMIN, 'N' ) ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -10 + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + INFO = -14 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DLATRS3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Initialize scaling factors +* + DO KK = 1, NRHS + SCALE( KK ) = ONE + END DO +* +* Quick return if possible +* + IF( MIN( N, NRHS ).EQ.0 ) + $ RETURN +* +* Determine machine dependent constant to control overflow. +* + BIGNUM = DLAMCH( 'Overflow' ) + SMLNUM = DLAMCH( 'Safe Minimum' ) +* +* Use unblocked code for small problems +* + IF( NRHS.LT.NRHSMIN ) THEN + CALL DLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1), + $ SCALE( 1 ), CNORM, INFO ) + DO K = 2, NRHS + CALL DLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Compute norms of blocks of A excluding diagonal blocks and find +* the block with the largest norm TMAX. +* + TMAX = ZERO + DO J = 1, NBA + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 + IF ( UPPER ) THEN + IFIRST = 1 + ILAST = J - 1 + ELSE + IFIRST = J + 1 + ILAST = NBA + END IF + DO I = IFIRST, ILAST + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Compute upper bound of A( I1:I2-1, J1:J2-1 ). +* + IF( NOTRAN ) THEN + ANRM = DLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + I+(J-1)*NBA ) = ANRM + ELSE + ANRM = DLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + J+(I-1)*NBA ) = ANRM + END IF + TMAX = MAX( TMAX, ANRM ) + END DO + END DO +* + IF( .NOT. TMAX.LE.DLAMCH('Overflow') ) THEN +* +* Some matrix entries have huge absolute value. At least one upper +* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point +* number, either due to overflow in LANGE or due to Inf in A. +* Fall back to LATRS. Set normin = 'N' for every right-hand side to +* force computation of TSCAL in LATRS to avoid the likely overflow +* in the computation of the column norms CNORM. +* + DO K = 1, NRHS + CALL DLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Every right-hand side requires workspace to store NBA local scale +* factors. To save workspace, X is computed successively in block columns +* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient +* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. + DO K = 1, NBX +* Loop over block columns (index = K) of X and, for column-wise scalings, +* over individual columns (index = KK). 
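+*     Each block column of X contains at most NBRHS right-hand sides.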
+* K1: column index of the first column in X( J, K ) +* K2: column index of the first column in X( J, K+1 ) +* so the K2 - K1 is the column count of the block X( J, K ) + K1 = (K-1)*NBRHS + 1 + K2 = MIN( K*NBRHS, NRHS ) + 1 +* +* Initialize local scaling factors of current block column X( J, K ) +* + DO KK = 1, K2-K1 + DO I = 1, NBA + WORK( I+KK*LDS ) = ONE + END DO + END DO +* + IF( NOTRAN ) THEN +* +* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = NBA + JLAST = 1 + JINC = -1 + ELSE + JFIRST = 1 + JLAST = NBA + JINC = 1 + END IF + ELSE +* +* Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = 1 + JLAST = NBA + JINC = 1 + ELSE + JFIRST = NBA + JLAST = 1 + JINC = -1 + END IF + END IF +* + DO J = JFIRST, JLAST, JINC +* J1: row index of the first row in A( J, J ) +* J2: row index of the first row in A( J+1, J+1 ) +* so that J2 - J1 is the row count of the block A( J, J ) + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 +* +* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) +* for all right-hand sides in the current block column, +* one RHS at a time. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( KK.EQ.1 ) THEN + CALL DLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + ELSE + CALL DLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + END IF +* Find largest absolute value entry in the vector segment +* X( J1:J2-1, RHS ) as an upper bound for the worst case +* growth in the linear updates. + XNRM( KK ) = DLANGE( 'I', J2-J1, 1, X( J1, RHS ), + $ LDX, W ) +* + IF( SCALOC .EQ. ZERO ) THEN +* LATRS found that A is singular through A(j,j) = 0. +* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 +* and compute A*x = 0 (or A**T*x = 0). Note that +* X(J1:J2-1, KK) is set by LATRS. + SCALE( RHS ) = ZERO + DO II = 1, J1-1 + X( II, KK ) = ZERO + END DO + DO II = J2, N + X( II, KK ) = ZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + ELSE IF( SCALOC * WORK( J+KK*LDS ) .EQ. ZERO ) THEN +* LATRS computed a valid scale factor, but combined with +* the current scaling the solution does not have a +* scale factor > 0. +* +* Set WORK( J+KK*LDS ) to smallest valid scale +* factor and increase SCALOC accordingly. + SCAL = WORK( J+KK*LDS ) / SMLNUM + SCALOC = SCALOC * SCAL + WORK( J+KK*LDS ) = SMLNUM +* If LATRS overestimated the growth, x may be +* rescaled to preserve a valid combined scale +* factor WORK( J, KK ) > 0. + RSCAL = ONE / SCALOC + IF( XNRM( KK ) * RSCAL .LE. BIGNUM ) THEN + XNRM( KK ) = XNRM( KK ) * RSCAL + CALL DSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) + SCALOC = ONE + ELSE +* The system op(A) * x = b is badly scaled and its +* solution cannot be represented as (1/scale) * x. +* Set x to zero. This approach deviates from LATRS +* where a completely meaningless non-zero vector +* is returned that is not a solution to op(A) * x = b. + SCALE( RHS ) = ZERO + DO II = 1, N + X( II, KK ) = ZERO + END DO +* Discard the local scale factors. 
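+*                 A zero solution carries no scaling information, so
+*                 the factors are reset to ONE.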
+ DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + END IF + END IF + SCALOC = SCALOC * WORK( J+KK*LDS ) + WORK( J+KK*LDS ) = SCALOC + END DO +* +* Linear block updates +* + IF( NOTRAN ) THEN + IF( UPPER ) THEN + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + ELSE + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + END IF + ELSE + IF( UPPER ) THEN + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + ELSE + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + END IF + END IF +* + DO I = IFIRST, ILAST, IINC +* I1: row index of the first column in X( I, K ) +* I2: row index of the first column in X( I+1, K ) +* so the I2 - I1 is the row count of the block X( I, K ) + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Prepare the linear update to be executed with GEMM. +* For each column, compute a consistent scaling, a +* scaling factor to survive the linear update, and +* rescale the column segments, if necesssary. Then +* the linear update is safely executed. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 +* Compute consistent scaling + SCAMIN = MIN( WORK( I + KK*LDS), WORK( J + KK*LDS ) ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + BNRM = DLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) + BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) + XNRM( KK ) = XNRM( KK )*(SCAMIN / WORK( J+KK*LDS )) + ANRM = WORK( AWRK + I+(J-1)*NBA ) + SCALOC = DLARMM( ANRM, XNRM( KK ), BNRM ) +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to B( I, KK ) and B( J, KK ). +* + SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL DSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + WORK( I+KK*LDS ) = SCAMIN*SCALOC + END IF +* + SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL DSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) + WORK( J+KK*LDS ) = SCAMIN*SCALOC + END IF + END DO +* + IF( NOTRAN ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) +* + CALL DGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -ONE, + $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, + $ ONE, X( I1, K1 ), LDX ) + ELSE +* +* B( I, K ) := B( I, K ) - A( J, I )**T * X( J, K ) +* + CALL DGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -ONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ ONE, X( I1, K1 ), LDX ) + END IF + END DO + END DO +* +* Reduce local scaling factors +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + DO I = 1, NBA + SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) + END DO + END DO +* +* Realize consistent scaling +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. ZERO ) THEN + DO I = 1, NBA + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 + SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) + IF( SCAL.NE.ONE ) + $ CALL DSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + END DO + END IF + END DO + END DO + RETURN +* +* End of DLATRS3 +* + END diff --git a/lapack-netlib/SRC/dtrsyl3.f b/lapack-netlib/SRC/dtrsyl3.f new file mode 100644 index 000000000..c44ec3808 --- /dev/null +++ b/lapack-netlib/SRC/dtrsyl3.f @@ -0,0 +1,1241 @@ +*> \brief \b DTRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> DTRSYL3 solves the real Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**T, and A and B are both upper quasi- +*> triangular. A is M-by-M and B is N-by-N; the right hand side C and +*> the solution X are M-by-N; and scale is an output scale factor, set +*> <= 1 to avoid overflow in X. 
+*> +*> A and B must be in Schur canonical form (as returned by DHSEQR), that +*> is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; +*> each 2-by-2 diagonal block has its diagonal elements equal and its +*> off-diagonal elements of opposite sign. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'T': op(A) = A**T (Transpose) +*> = 'C': op(A) = A**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'T': op(B) = B**T (Transpose) +*> = 'C': op(B) = B**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,M) +*> The upper quasi-triangular matrix A, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (LDB,N) +*> The upper quasi-triangular matrix B, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is DOUBLE PRECISION array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> IWORK is INTEGER +*> The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) +*> + ((N + NB - 1) / NB + 1), where NB is the optimal block size. +*> +*> If LIWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimension of the IWORK array, +*> returns this value as the first entry of the IWORK array, and +*> no error message related to LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is DOUBLE PRECISION array, dimension (MAX(2, ROWS), +*> MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. 
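+*>          For example, if M = N = 1000 and the block size is
+*>          NB = 100, then ROWS = 11, so LDSWORK >= 11 and the
+*>          minimal LIWORK is 22.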
+*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE DTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, IWORK, LIWORK, SWORK, LDSWORK, + $ INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, M, N, + $ LIWORK, LDSWORK + DOUBLE PRECISION SCALE +* .. +* .. Array Arguments .. + INTEGER IWORK( * ) + DOUBLE PRECISION A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY, SKIP + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB, PC + DOUBLE PRECISION ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM +* .. +* .. Local Arrays .. + DOUBLE PRECISION WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLANGE, DLAMCH, DLARMM + EXTERNAL DLANGE, DLAMCH, DLARMM, ILAENV, LSAME +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DLASCL, DSCAL, DTRSYL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, EXPONENT, MAX, MIN +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX(8, ILAENV( 1, 'DTRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LIWORK.EQ.-1 .OR. LDSWORK.EQ.-1 ) + IWORK( 1 ) = NBA + NBB + 2 + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK( 1, 1 ) = MAX( NBA, NBB ) + SWORK( 2, 1 ) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT.LSAME( TRANA, 'T' ) .AND. .NOT. + $ LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT.LSAME( TRANB, 'T' ) .AND. .NOT. + $ LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. 
ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DTRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + SCALE = ONE + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems or if insufficient +* workspaces are provided +* + IF( MIN( NBA, NBB ).EQ.1 .OR. LDSWORK.LT.MAX( NBA, NBB ) .OR. + $ LIWORK.LT.IWORK(1) ) THEN + CALL DTRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = DLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Partition A such that 2-by-2 blocks on the diagonal are not split +* + SKIP = .FALSE. + DO I = 1, NBA + IWORK( I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( NBA + 1 ) = M + 1 + DO K = 1, NBA + L1 = IWORK( K ) + L2 = IWORK( K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.M ) THEN +* A( M, M ) is a 1-by-1 block + CYCLE + END IF + IF( A( L, L+1 ).NE.ZERO .AND. A( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( K + 1 ) ) THEN + IWORK( K + 1 ) = IWORK( K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( NBA + 1 ) = M + 1 + IF( IWORK( NBA ).GE.IWORK( NBA + 1 ) ) THEN + IWORK( NBA ) = IWORK( NBA + 1 ) + NBA = NBA - 1 + END IF +* +* Partition B such that 2-by-2 blocks on the diagonal are not split +* + PC = NBA + 1 + SKIP = .FALSE. + DO I = 1, NBB + IWORK( PC + I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( PC + NBB + 1 ) = N + 1 + DO K = 1, NBB + L1 = IWORK( PC + K ) + L2 = IWORK( PC + K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.N ) THEN +* B( N, N ) is a 1-by-1 block + CYCLE + END IF + IF( B( L, L+1 ).NE.ZERO .AND. B( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( PC + K + 1 ) ) THEN + IWORK( PC + K + 1 ) = IWORK( PC + K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( PC + NBB + 1 ) = N + 1 + IF( IWORK( PC + NBB ).GE.IWORK( PC + NBB + 1 ) ) THEN + IWORK( PC + NBB ) = IWORK( PC + NBB + 1 ) + NBB = NBB - 1 + END IF +* +* Set local scaling factors - must never attain zero. +* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. +* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = K, NBA + L1 = IWORK( L ) + L2 = IWORK( L + 1 ) + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = DLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = DLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = IWORK( PC + K ) + K2 = IWORK( PC + K + 1 ) + DO L = K, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = DLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = DLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = DBLE( ISGN ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. 
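+*
+*     As a concrete illustration of the SWORK layout prepared above:
+*     with, say, NBA = 3 and NBB = 2, columns 1:2 of SWORK hold the
+*     block scale factors SWORK( K, L ), columns AWRK+1:AWRK+NBA = 3:5
+*     hold the upper bounds of the blocks of A, and columns
+*     BWRK+1:BWRK+NBB = 6:7 hold the upper bounds of the blocks of B,
+*     i.e. 2*NBB + NBA = 7 columns in total, matching the optimal
+*     COLS returned by a workspace query.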
+* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF ( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO JJ = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. 
ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. 
The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. 
.NOT.NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + SWORK( K, L ) = SCALOC * SWORK( K, L ) + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. 
ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL DTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. 
The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = DLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL DSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = DLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO JJ = J1, J2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL DGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO +* + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is DOUBLE PRECISION. Set SCALE to +* zero and give up. +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL DSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF + + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? +* + SCAL = C( 1, 1 ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( C( K, L ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. +* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL DLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of DTRSYL3 +* + END diff --git a/lapack-netlib/SRC/ilaenv.f b/lapack-netlib/SRC/ilaenv.f index af2850398..a639e0375 100644 --- a/lapack-netlib/SRC/ilaenv.f +++ b/lapack-netlib/SRC/ilaenv.f @@ -469,6 +469,15 @@ ELSE NB = 64 END IF + ELSE IF( C3.EQ.'SYL' ) THEN +* The upper bound is to prevent overly aggressive scaling. + IF( SNAME ) THEN + NB = MIN( MAX( 48, INT( ( MIN( N1, N2 ) * 16 ) / 100) ), + $ 240 ) + ELSE + NB = MIN( MAX( 24, INT( ( MIN( N1, N2 ) * 8 ) / 100) ), + $ 80 ) + END IF END IF ELSE IF( C2.EQ.'LA' ) THEN IF( C3.EQ.'UUM' ) THEN @@ -477,6 +486,12 @@ ELSE NB = 64 END IF + ELSE IF( C3.EQ.'TRS' ) THEN + IF( SNAME ) THEN + NB = 32 + ELSE + NB = 32 + END IF END IF ELSE IF( SNAME .AND. C2.EQ.'ST' ) THEN IF( C3.EQ.'EBZ' ) THEN diff --git a/lapack-netlib/SRC/slarmm.f b/lapack-netlib/SRC/slarmm.f new file mode 100644 index 000000000..643dd6748 --- /dev/null +++ b/lapack-netlib/SRC/slarmm.f @@ -0,0 +1,99 @@ +*> \brief \b SLARMM +* +* Definition: +* =========== +* +* REAL FUNCTION SLARMM( ANORM, BNORM, CNORM ) +* +* .. Scalar Arguments .. +* REAL ANORM, BNORM, CNORM +* .. 
+* +*> \par Purpose: +* ======= +*> +*> \verbatim +*> +*> SLARMM returns a factor s in (0, 1] such that the linear updates +*> +*> (s * C) - A * (s * B) and (s * C) - (s * A) * B +*> +*> cannot overflow, where A, B, and C are matrices of conforming +*> dimensions. +*> +*> This is an auxiliary routine so there is no argument checking. +*> \endverbatim +* +* Arguments: +* ========= +* +*> \param[in] ANORM +*> \verbatim +*> ANORM is REAL +*> The infinity norm of A. ANORM >= 0. +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] BNORM +*> \verbatim +*> BNORM is REAL +*> The infinity norm of B. BNORM >= 0. +*> \endverbatim +*> +*> \param[in] CNORM +*> \verbatim +*> CNORM is REAL +*> The infinity norm of C. CNORM >= 0. +*> \endverbatim +*> +*> +* ===================================================================== +*> References: +*> C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for +*> Robust Solution of Triangular Linear Systems. In: International +*> Conference on Parallel Processing and Applied Mathematics, pages +*> 68--78. Springer, 2017. +*> +*> \ingroup OTHERauxiliary +* ===================================================================== + + REAL FUNCTION SLARMM( ANORM, BNORM, CNORM ) + IMPLICIT NONE +* .. Scalar Arguments .. + REAL ANORM, BNORM, CNORM +* .. Parameters .. + REAL ONE, HALF, FOUR + PARAMETER ( ONE = 1.0E0, HALF = 0.5E+0, FOUR = 4.0E+0 ) +* .. +* .. Local Scalars .. + REAL BIGNUM, SMLNUM +* .. +* .. External Functions .. + REAL SLAMCH + EXTERNAL SLAMCH +* .. +* .. Executable Statements .. +* +* +* Determine machine dependent parameters to control overflow. +* + SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' ) + BIGNUM = ( ONE / SMLNUM ) / FOUR +* +* Compute a scale factor. +* + SLARMM = ONE + IF( BNORM .LE. ONE ) THEN + IF( ANORM * BNORM .GT. BIGNUM - CNORM ) THEN + SLARMM = HALF + END IF + ELSE + IF( ANORM .GT. (BIGNUM - CNORM) / BNORM ) THEN + SLARMM = HALF / BNORM + END IF + END IF + RETURN +* +* ==== End of SLARMM ==== +* + END diff --git a/lapack-netlib/SRC/slatrs3.f b/lapack-netlib/SRC/slatrs3.f new file mode 100644 index 000000000..c3a08e524 --- /dev/null +++ b/lapack-netlib/SRC/slatrs3.f @@ -0,0 +1,656 @@ +*> \brief \b SLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. +* +* Definition: +* =========== +* +* SUBROUTINE SLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, +* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER DIAG, NORMIN, TRANS, UPLO +* INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), CNORM( * ), SCALE( * ), +* WORK( * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SLATRS3 solves one of the triangular systems +*> +*> A * X = B * diag(scale) or A**T * X = B * diag(scale) +*> +*> with scaling to prevent overflow. Here A is an upper or lower +*> triangular matrix, A**T denotes the transpose of A. X and B are +*> n by nrhs matrices and scale is an nrhs element vector of scaling +*> factors. A scaling factor scale(j) is usually less than or equal +*> to 1, chosen such that X(:,j) is less than the overflow threshold. +*> If the matrix A is singular (A(j,j) = 0 for some j), then +*> a non-trivial solution to A*X = 0 is returned. If the system is +*> so badly scaled that the solution cannot be represented as +*> (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. 
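+*>
+*> For example, if scale(k) = 2**(-100) is returned, the solution of
+*> the unscaled system A * x = b(:,k) is 2**100 * X(:,k); the scaled
+*> form is returned because forming that product may overflow.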
+*> +*> This is a BLAS-3 version of LATRS for solving several right +*> hand sides simultaneously. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the matrix A is upper or lower triangular. +*> = 'U': Upper triangular +*> = 'L': Lower triangular +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the operation applied to A. +*> = 'N': Solve A * x = s*b (No transpose) +*> = 'T': Solve A**T* x = s*b (Transpose) +*> = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> Specifies whether or not the matrix A is unit triangular. +*> = 'N': Non-unit triangular +*> = 'U': Unit triangular +*> \endverbatim +*> +*> \param[in] NORMIN +*> \verbatim +*> NORMIN is CHARACTER*1 +*> Specifies whether CNORM has been set or not. +*> = 'Y': CNORM contains the column norms on entry +*> = 'N': CNORM is not set on entry. On exit, the norms will +*> be computed and stored in CNORM. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of columns of X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> The triangular matrix A. If UPLO = 'U', the leading n by n +*> upper triangular part of the array A contains the upper +*> triangular matrix, and the strictly lower triangular part of +*> A is not referenced. If UPLO = 'L', the leading n by n lower +*> triangular part of the array A contains the lower triangular +*> matrix, and the strictly upper triangular part of A is not +*> referenced. If DIAG = 'U', the diagonal elements of A are +*> also not referenced and are assumed to be 1. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max (1,N). +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is REAL array, dimension (LDX,NRHS) +*> On entry, the right hand side B of the triangular system. +*> On exit, X is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max (1,N). +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL array, dimension (NRHS) +*> The scaling factor s(k) is for the triangular system +*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). +*> If SCALE = 0, the matrix A is singular or badly scaled. +*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) +*> that is an exact or approximate solution to A*x(:,k) = 0 +*> is returned. If the system so badly scaled that solution +*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 +*> is returned. +*> \endverbatim +*> +*> \param[in,out] CNORM +*> \verbatim +*> CNORM is REAL array, dimension (N) +*> +*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) +*> contains the norm of the off-diagonal part of the j-th column +*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal +*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) +*> must be greater than or equal to the 1-norm. +*> +*> If NORMIN = 'N', CNORM is an output argument and CNORM(j) +*> returns the 1-norm of the offdiagonal part of the j-th column +*> of A. 
+*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (LWORK). +*> On exit, if INFO = 0, WORK(1) returns the optimal size of +*> WORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> LWORK is INTEGER +*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where +*> NBA = (N + NB - 1)/NB and NB is the optimal block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -k, the k-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +*> \par Further Details: +* ===================== +* \verbatim +* The algorithm follows the structure of a block triangular solve. +* The diagonal block is solved with a call to the robust the triangular +* solver LATRS for every right-hand side RHS = 1, ..., NRHS +* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), +* where op( A ) = A or op( A ) = A**T. +* The linear block updates operate on block columns of X, +* B( I, K ) - op(A( I, J )) * X( J, K ) +* and use GEMM. To avoid overflow in the linear block update, the worst case +* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed +* such that +* || s * B( I, RHS )||_oo +* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold +* +* Once all columns of a block column have been rescaled (BLAS-1), the linear +* update is executed with GEMM without overflow. +* +* To limit rescaling, local scale factors track the scaling of column segments. +* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA +* per right-hand side column RHS = 1, ..., NRHS. The global scale factor +* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) +* I = 1, ..., NBA. +* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) +* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The +* linear update of potentially inconsistently scaled vector segments +* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) +* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, +* if necessary, rescales the blocks prior to calling GEMM. +* +* \endverbatim +* ===================================================================== +* References: +* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). +* Parallel robust solution of triangular linear systems. Concurrency +* and Computation: Practice and Experience, 31(19), e5064. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE SLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, + $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER DIAG, TRANS, NORMIN, UPLO + INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. + REAL A( LDA, * ), CNORM( * ), X( LDX, * ), + $ SCALE( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. 
+ REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN + PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) + PARAMETER ( NBMIN = 8, NBMAX = 64 ) +* .. +* .. Local Arrays .. + REAL W( NBMAX ), XNRM( NBRHS ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER + INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, + $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + REAL ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, + $ SCAMIN, SMLNUM, TMAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, SLANGE, SLARMM + EXTERNAL ILAENV, LSAME, SLAMCH, SLANGE, SLARMM +* .. +* .. External Subroutines .. + EXTERNAL SLATRS, SSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOTRAN = LSAME( TRANS, 'N' ) + NOUNIT = LSAME( DIAG, 'N' ) + LQUERY = ( LWORK.EQ.-1 ) +* +* Partition A and X into blocks. +* + NB = MAX( 8, ILAENV( 1, 'SLATRS', '', N, N, -1, -1 ) ) + NB = MIN( NBMAX, NB ) + NBA = MAX( 1, (N + NB - 1) / NB ) + NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) +* +* Compute the workspace +* +* The workspace comprises two parts. +* The first part stores the local scale factors. Each simultaneously +* computed right-hand side requires one local scale factor per block +* row. WORK( I + KK * LDS ) is the scale factor of the vector +* segment associated with the I-th block row and the KK-th vector +* in the block column. + LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) + LDS = NBA +* The second part stores upper bounds of the triangular A. There are +* a total of NBA x NBA blocks, of which only the upper triangular +* part or the lower triangular part is referenced. The upper bound of +* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). + LANRM = NBA * NBA + AWRK = LSCALE + WORK( 1 ) = LSCALE + LANRM +* +* Test the input parameters. +* + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -3 + ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. + $ LSAME( NORMIN, 'N' ) ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -10 + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + INFO = -14 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SLATRS3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Initialize scaling factors +* + DO KK = 1, NRHS + SCALE( KK ) = ONE + END DO +* +* Quick return if possible +* + IF( MIN( N, NRHS ).EQ.0 ) + $ RETURN +* +* Determine machine dependent constant to control overflow. +* + BIGNUM = SLAMCH( 'Overflow' ) + SMLNUM = SLAMCH( 'Safe Minimum' ) +* +* Use unblocked code for small problems +* + IF( NRHS.LT.NRHSMIN ) THEN + CALL SLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1), + $ SCALE( 1 ), CNORM, INFO ) + DO K = 2, NRHS + CALL SLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Compute norms of blocks of A excluding diagonal blocks and find +* the block with the largest norm TMAX. 
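+*     For example, with N = 100 and NB = 32 the NBA = 4 diagonal blocks
+*     start at rows/columns 1, 33, 65 and 97 and, in the no-transpose
+*     case, the bound of the off-diagonal block A( I1:I2-1, J1:J2-1 )
+*     is stored in WORK( AWRK + I + (J-1)*NBA ).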
+* + TMAX = ZERO + DO J = 1, NBA + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 + IF ( UPPER ) THEN + IFIRST = 1 + ILAST = J - 1 + ELSE + IFIRST = J + 1 + ILAST = NBA + END IF + DO I = IFIRST, ILAST + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Compute upper bound of A( I1:I2-1, J1:J2-1 ). +* + IF( NOTRAN ) THEN + ANRM = SLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + I+(J-1)*NBA ) = ANRM + ELSE + ANRM = SLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + J+(I-1)*NBA ) = ANRM + END IF + TMAX = MAX( TMAX, ANRM ) + END DO + END DO +* + IF( .NOT. TMAX.LE.SLAMCH('Overflow') ) THEN +* +* Some matrix entries have huge absolute value. At least one upper +* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point +* number, either due to overflow in LANGE or due to Inf in A. +* Fall back to LATRS. Set normin = 'N' for every right-hand side to +* force computation of TSCAL in LATRS to avoid the likely overflow +* in the computation of the column norms CNORM. +* + DO K = 1, NRHS + CALL SLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Every right-hand side requires workspace to store NBA local scale +* factors. To save workspace, X is computed successively in block columns +* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient +* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. + DO K = 1, NBX +* Loop over block columns (index = K) of X and, for column-wise scalings, +* over individual columns (index = KK). +* K1: column index of the first column in X( J, K ) +* K2: column index of the first column in X( J, K+1 ) +* so the K2 - K1 is the column count of the block X( J, K ) + K1 = (K-1)*NBRHS + 1 + K2 = MIN( K*NBRHS, NRHS ) + 1 +* +* Initialize local scaling factors of current block column X( J, K ) +* + DO KK = 1, K2 - K1 + DO I = 1, NBA + WORK( I+KK*LDS ) = ONE + END DO + END DO +* + IF( NOTRAN ) THEN +* +* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = NBA + JLAST = 1 + JINC = -1 + ELSE + JFIRST = 1 + JLAST = NBA + JINC = 1 + END IF + ELSE +* +* Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = 1 + JLAST = NBA + JINC = 1 + ELSE + JFIRST = NBA + JLAST = 1 + JINC = -1 + END IF + END IF +* + DO J = JFIRST, JLAST, JINC +* J1: row index of the first row in A( J, J ) +* J2: row index of the first row in A( J+1, J+1 ) +* so that J2 - J1 is the row count of the block A( J, J ) + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 +* +* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) +* for all right-hand sides in the current block column, +* one RHS at a time. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( KK.EQ.1 ) THEN + CALL SLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + ELSE + CALL SLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + END IF +* Find largest absolute value entry in the vector segment +* X( J1:J2-1, RHS ) as an upper bound for the worst case +* growth in the linear updates. + XNRM( KK ) = SLANGE( 'I', J2-J1, 1, X( J1, RHS ), + $ LDX, W ) +* + IF( SCALOC .EQ. ZERO ) THEN +* LATRS found that A is singular through A(j,j) = 0. +* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 +* and compute A*x = 0 (or A**T*x = 0). Note that +* X(J1:J2-1, KK) is set by LATRS. 
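+*                 The zero entry SCALE( RHS ) = 0 is what signals the
+*                 singularity of A to the caller, in line with the
+*                 description of the SCALE argument above.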
+ SCALE( RHS ) = ZERO + DO II = 1, J1-1 + X( II, KK ) = ZERO + END DO + DO II = J2, N + X( II, KK ) = ZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. ZERO ) THEN +* LATRS computed a valid scale factor, but combined with +* the current scaling the solution does not have a +* scale factor > 0. +* +* Set WORK( J+KK*LDS ) to smallest valid scale +* factor and increase SCALOC accordingly. + SCAL = WORK( J+KK*LDS ) / SMLNUM + SCALOC = SCALOC * SCAL + WORK( J+KK*LDS ) = SMLNUM +* If LATRS overestimated the growth, x may be +* rescaled to preserve a valid combined scale +* factor WORK( J, KK ) > 0. + RSCAL = ONE / SCALOC + IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN + XNRM( KK ) = XNRM( KK ) * RSCAL + CALL SSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) + SCALOC = ONE + ELSE +* The system op(A) * x = b is badly scaled and its +* solution cannot be represented as (1/scale) * x. +* Set x to zero. This approach deviates from LATRS +* where a completely meaningless non-zero vector +* is returned that is not a solution to op(A) * x = b. + SCALE( RHS ) = ZERO + DO II = 1, N + X( II, KK ) = ZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + END IF + END IF + SCALOC = SCALOC * WORK( J+KK*LDS ) + WORK( J+KK*LDS ) = SCALOC + END DO +* +* Linear block updates +* + IF( NOTRAN ) THEN + IF( UPPER ) THEN + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + ELSE + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + END IF + ELSE + IF( UPPER ) THEN + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + ELSE + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + END IF + END IF +* + DO I = IFIRST, ILAST, IINC +* I1: row index of the first column in X( I, K ) +* I2: row index of the first column in X( I+1, K ) +* so the I2 - I1 is the row count of the block X( I, K ) + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Prepare the linear update to be executed with GEMM. +* For each column, compute a consistent scaling, a +* scaling factor to survive the linear update, and +* rescale the column segments, if necesssary. Then +* the linear update is safely executed. +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 +* Compute consistent scaling + SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + BNRM = SLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) + BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) + XNRM( KK ) = XNRM( KK )*(SCAMIN / WORK( J+KK*LDS )) + ANRM = WORK( AWRK + I+(J-1)*NBA ) + SCALOC = SLARMM( ANRM, XNRM( KK ), BNRM ) +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to B( I, KK ) and B( J, KK ). 
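+*                 For example, if WORK( I+KK*LDS ) = 1/2 and
+*                 WORK( J+KK*LDS ) = 1/8, then SCAMIN = 1/8; apart from
+*                 the robust factor SCALOC, X( I1:I2-1, RHS ) is
+*                 multiplied by 1/4 and X( J1:J2-1, RHS ) is left
+*                 unchanged, so both segments carry the common scale
+*                 factor 1/8 before the GEMM update below.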
+* + SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL SSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + WORK( I+KK*LDS ) = SCAMIN*SCALOC + END IF +* + SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL SSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) + WORK( J+KK*LDS ) = SCAMIN*SCALOC + END IF + END DO +* + IF( NOTRAN ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) +* + CALL SGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -ONE, + $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, + $ ONE, X( I1, K1 ), LDX ) + ELSE +* +* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) +* + CALL SGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -ONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ ONE, X( I1, K1 ), LDX ) + END IF + END DO + END DO +* +* Reduce local scaling factors +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + DO I = 1, NBA + SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) + END DO + END DO +* +* Realize consistent scaling +* + DO KK = 1, K2-K1 + RHS = K1 + KK - 1 + IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. ZERO ) THEN + DO I = 1, NBA + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 + SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) + IF( SCAL.NE.ONE ) + $ CALL SSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + END DO + END IF + END DO + END DO + RETURN +* +* End of SLATRS3 +* + END diff --git a/lapack-netlib/SRC/strsyl3.f b/lapack-netlib/SRC/strsyl3.f new file mode 100644 index 000000000..28762c2ed --- /dev/null +++ b/lapack-netlib/SRC/strsyl3.f @@ -0,0 +1,1244 @@ +*> \brief \b STRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> STRSYL3 solves the real Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**T, and A and B are both upper quasi- +*> triangular. A is M-by-M and B is N-by-N; the right hand side C and +*> the solution X are M-by-N; and scale is an output scale factor, set +*> <= 1 to avoid overflow in X. +*> +*> A and B must be in Schur canonical form (as returned by SHSEQR), that +*> is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; +*> each 2-by-2 diagonal block has its diagonal elements equal and its +*> off-diagonal elements of opposite sign. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'T': op(A) = A**T (Transpose) +*> = 'C': op(A) = A**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'T': op(B) = B**T (Transpose) +*> = 'C': op(B) = B**H (Conjugate transpose = Transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is REAL array, dimension (LDA,M) +*> The upper quasi-triangular matrix A, in Schur canonical form. 
+*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is REAL array, dimension (LDB,N) +*> The upper quasi-triangular matrix B, in Schur canonical form. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is REAL array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is REAL +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] IWORK +*> \verbatim +*> IWORK is INTEGER array, dimension (MAX(1,LIWORK)) +*> On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. +*> \endverbatim +*> +*> \param[in] LIWORK +*> \verbatim +*> IWORK is INTEGER +*> The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) +*> + ((N + NB - 1) / NB + 1), where NB is the optimal block size. +*> +*> If LIWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimension of the IWORK array, +*> returns this value as the first entry of the IWORK array, and +*> no error message related to LIWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is REAL array, dimension (MAX(2, ROWS), +*> MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. +*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE STRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, IWORK, LIWORK, SWORK, LDSWORK, + $ INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, M, N, + $ LIWORK, LDSWORK + REAL SCALE +* .. +* .. Array Arguments .. 
+ INTEGER IWORK( * ) + REAL A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY, SKIP + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB, PC + REAL ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM +* .. +* .. Local Arrays .. + REAL WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLANGE, SLAMCH, SLARMM + EXTERNAL SLANGE, SLAMCH, SLARMM, ILAENV, LSAME +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SLASCL, SSCAL, STRSYL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, EXPONENT, MAX, MIN, REAL +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX(8, ILAENV( 1, 'STRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LIWORK.EQ.-1 .OR. LDSWORK.EQ.-1 ) + IWORK( 1 ) = NBA + NBB + 2 + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK( 1, 1 ) = MAX( NBA, NBB ) + SWORK( 2, 1 ) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT.LSAME( TRANA, 'T' ) .AND. .NOT. + $ LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT.LSAME( TRANB, 'T' ) .AND. .NOT. + $ LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + ELSE IF( .NOT.LQUERY .AND. LIWORK.LT.IWORK(1) ) THEN + INFO = -14 + ELSE IF( .NOT.LQUERY .AND. LDSWORK.LT.MAX( NBA, NBB ) ) THEN + INFO = -16 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'STRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + SCALE = ONE + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems or if insufficient +* workspaces are provided +* + IF( MIN( NBA, NBB ).EQ.1 .OR. LDSWORK.LT.MAX( NBA, NBB ) .OR. + $ LIWORK.LT.IWORK(1) ) THEN + CALL STRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = SLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Partition A such that 2-by-2 blocks on the diagonal are not split +* + SKIP = .FALSE. + DO I = 1, NBA + IWORK( I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( NBA + 1 ) = M + 1 + DO K = 1, NBA + L1 = IWORK( K ) + L2 = IWORK( K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.M ) THEN +* A( M, M ) is a 1-by-1 block + CYCLE + END IF + IF( A( L, L+1 ).NE.ZERO .AND. A( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( K + 1 ) ) THEN + IWORK( K + 1 ) = IWORK( K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( NBA + 1 ) = M + 1 + IF( IWORK( NBA ).GE.IWORK( NBA + 1 ) ) THEN + IWORK( NBA ) = IWORK( NBA + 1 ) + NBA = NBA - 1 + END IF +* +* Partition B such that 2-by-2 blocks on the diagonal are not split +* + PC = NBA + 1 + SKIP = .FALSE. 
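+* For illustration, assume NB = 8 and that B has a 2-by-2 diagonal
+* block in rows 8:9: the initial boundary IWORK( PC + 2 ) = 9 would
+* split that block, so it is moved to 10 and the first block of B
+* then covers rows 1:9.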
+ DO I = 1, NBB + IWORK( PC + I ) = ( I - 1 ) * NB + 1 + END DO + IWORK( PC + NBB + 1 ) = N + 1 + DO K = 1, NBB + L1 = IWORK( PC + K ) + L2 = IWORK( PC + K + 1 ) - 1 + DO L = L1, L2 + IF( SKIP ) THEN + SKIP = .FALSE. + CYCLE + END IF + IF( L.GE.N ) THEN +* B( N, N ) is a 1-by-1 block + CYCLE + END IF + IF( B( L, L+1 ).NE.ZERO .AND. B( L+1, L ).NE.ZERO ) THEN +* Check if 2-by-2 block is split + IF( L + 1 .EQ. IWORK( PC + K + 1 ) ) THEN + IWORK( PC + K + 1 ) = IWORK( PC + K + 1 ) + 1 + CYCLE + END IF + SKIP = .TRUE. + END IF + END DO + END DO + IWORK( PC + NBB + 1 ) = N + 1 + IF( IWORK( PC + NBB ).GE.IWORK( PC + NBB + 1 ) ) THEN + IWORK( PC + NBB ) = IWORK( PC + NBB + 1 ) + NBB = NBB - 1 + END IF +* +* Set local scaling factors - must never attain zero. +* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. +* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = K, NBA + L1 = IWORK( L ) + L2 = IWORK( L + 1 ) + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = SLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = SLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = IWORK( PC + K ) + K2 = IWORK( PC + K + 1 ) + DO L = K, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = SLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = SLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = REAL( ISGN ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF ( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. 
+ SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO JJ = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B = scale*C. 
+* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. 
ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A**T*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. 
The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'T', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. 
.NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**T = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) +* + CALL STRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = SLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = IWORK( I ) + I2 = IWORK( I + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = SLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF (SCAL .NE. ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF (SCAL .NE. 
ONE) THEN + DO LL = L1, L2-1 + CALL SSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -ONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ ONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T +* + J1 = IWORK( PC + J ) + J2 = IWORK( PC + J + 1 ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = SLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = SLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.E0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.E0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.E0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.E0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL SGEMM( 'N', 'T', K2-K1, J2-J1, L2-L1, -SGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ ONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO +* + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is REAL. Set SCALE to zero and give up. +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = IWORK( K ) + K2 = IWORK( K + 1 ) + DO L = 1, NBB + L1 = IWORK( PC + L ) + L2 = IWORK( PC + L + 1 ) + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL SSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF + + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? +* + SCAL = C( 1, 1 ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( C( K, L ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. 
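+* For illustration, assume BUF = 2**(-40) and a largest entry of
+* magnitude SCAL = 1.0E+30: in single precision
+* MIN( BIGNUM / SCAL, ONE / BUF ) is about 8.5E+7, so C is scaled up
+* by that factor and BUF grows to about 7.7E-5, i.e. much closer to 1.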
+* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL SLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + IWORK(1) = NBA + NBB + 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of STRSYL3 +* + END diff --git a/lapack-netlib/SRC/zlatrs3.f b/lapack-netlib/SRC/zlatrs3.f new file mode 100644 index 000000000..fc1be0517 --- /dev/null +++ b/lapack-netlib/SRC/zlatrs3.f @@ -0,0 +1,667 @@ +*> \brief \b ZLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. +* +* Definition: +* =========== +* +* SUBROUTINE ZLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, +* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) +* +* .. Scalar Arguments .. +* CHARACTER DIAG, NORMIN, TRANS, UPLO +* INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. +* DOUBLE PRECISION CNORM( * ), SCALE( * ), WORK( * ) +* COMPLEX*16 A( LDA, * ), X( LDX, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZLATRS3 solves one of the triangular systems +*> +*> A * X = B * diag(scale), A**T * X = B * diag(scale), or +*> A**H * X = B * diag(scale) +*> +*> with scaling to prevent overflow. Here A is an upper or lower +*> triangular matrix, A**T denotes the transpose of A, A**H denotes the +*> conjugate transpose of A. X and B are n-by-nrhs matrices and scale +*> is an nrhs-element vector of scaling factors. A scaling factor scale(j) +*> is usually less than or equal to 1, chosen such that X(:,j) is less +*> than the overflow threshold. If the matrix A is singular (A(j,j) = 0 +*> for some j), then a non-trivial solution to A*X = 0 is returned. If +*> the system is so badly scaled that the solution cannot be represented +*> as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. +*> +*> This is a BLAS-3 version of LATRS for solving several right +*> hand sides simultaneously. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] UPLO +*> \verbatim +*> UPLO is CHARACTER*1 +*> Specifies whether the matrix A is upper or lower triangular. +*> = 'U': Upper triangular +*> = 'L': Lower triangular +*> \endverbatim +*> +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> Specifies the operation applied to A. +*> = 'N': Solve A * x = s*b (No transpose) +*> = 'T': Solve A**T* x = s*b (Transpose) +*> = 'C': Solve A**T* x = s*b (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] DIAG +*> \verbatim +*> DIAG is CHARACTER*1 +*> Specifies whether or not the matrix A is unit triangular. +*> = 'N': Non-unit triangular +*> = 'U': Unit triangular +*> \endverbatim +*> +*> \param[in] NORMIN +*> \verbatim +*> NORMIN is CHARACTER*1 +*> Specifies whether CNORM has been set or not. +*> = 'Y': CNORM contains the column norms on entry +*> = 'N': CNORM is not set on entry. On exit, the norms will +*> be computed and stored in CNORM. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of columns of X. NRHS >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> The triangular matrix A. 
If UPLO = 'U', the leading n by n +*> upper triangular part of the array A contains the upper +*> triangular matrix, and the strictly lower triangular part of +*> A is not referenced. If UPLO = 'L', the leading n by n lower +*> triangular part of the array A contains the lower triangular +*> matrix, and the strictly upper triangular part of A is not +*> referenced. If DIAG = 'U', the diagonal elements of A are +*> also not referenced and are assumed to be 1. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max (1,N). +*> \endverbatim +*> +*> \param[in,out] X +*> \verbatim +*> X is COMPLEX*16 array, dimension (LDX,NRHS) +*> On entry, the right hand side B of the triangular system. +*> On exit, X is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDX +*> \verbatim +*> LDX is INTEGER +*> The leading dimension of the array X. LDX >= max (1,N). +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION array, dimension (NRHS) +*> The scaling factor s(k) is for the triangular system +*> A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). +*> If SCALE = 0, the matrix A is singular or badly scaled. +*> If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) +*> that is an exact or approximate solution to A*x(:,k) = 0 +*> is returned. If the system so badly scaled that solution +*> cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 +*> is returned. +*> \endverbatim +*> +*> \param[in,out] CNORM +*> \verbatim +*> CNORM is DOUBLE PRECISION array, dimension (N) +*> +*> If NORMIN = 'Y', CNORM is an input argument and CNORM(j) +*> contains the norm of the off-diagonal part of the j-th column +*> of A. If TRANS = 'N', CNORM(j) must be greater than or equal +*> to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) +*> must be greater than or equal to the 1-norm. +*> +*> If NORMIN = 'N', CNORM is an output argument and CNORM(j) +*> returns the 1-norm of the offdiagonal part of the j-th column +*> of A. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (LWORK). +*> On exit, if INFO = 0, WORK(1) returns the optimal size of +*> WORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> LWORK is INTEGER +*> LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where +*> NBA = (N + NB - 1)/NB and NB is the optimal block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -k, the k-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +*> \par Further Details: +* ===================== +* \verbatim +* The algorithm follows the structure of a block triangular solve. +* The diagonal block is solved with a call to the robust the triangular +* solver LATRS for every right-hand side RHS = 1, ..., NRHS +* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), +* where op( A ) = A or op( A ) = A**T or op( A ) = A**H. +* The linear block updates operate on block columns of X, +* B( I, K ) - op(A( I, J )) * X( J, K ) +* and use GEMM. 
To avoid overflow in the linear block update, the worst case +* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed +* such that +* || s * B( I, RHS )||_oo +* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold +* +* Once all columns of a block column have been rescaled (BLAS-1), the linear +* update is executed with GEMM without overflow. +* +* To limit rescaling, local scale factors track the scaling of column segments. +* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA +* per right-hand side column RHS = 1, ..., NRHS. The global scale factor +* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) +* I = 1, ..., NBA. +* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) +* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The +* linear update of potentially inconsistently scaled vector segments +* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) +* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, +* if necessary, rescales the blocks prior to calling GEMM. +* +* \endverbatim +* ===================================================================== +* References: +* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). +* Parallel robust solution of triangular linear systems. Concurrency +* and Computation: Practice and Experience, 31(19), e5064. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE ZLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, + $ X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER DIAG, TRANS, NORMIN, UPLO + INTEGER INFO, LDA, LWORK, LDX, N, NRHS +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( LDX, * ) + DOUBLE PRECISION CNORM( * ), SCALE( * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + COMPLEX*16 CZERO, CONE + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) + PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) + INTEGER NBMAX, NBMIN, NBRHS, NRHSMIN + PARAMETER ( NRHSMIN = 2, NBRHS = 32 ) + PARAMETER ( NBMIN = 8, NBMAX = 64 ) +* .. +* .. Local Arrays .. + DOUBLE PRECISION W( NBMAX ), XNRM( NBRHS ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, NOTRAN, NOUNIT, UPPER + INTEGER AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J, + $ JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2, + $ LANRM, LDS, LSCALE, NB, NBA, NBX, RHS + DOUBLE PRECISION ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC, + $ SCAMIN, SMLNUM, TMAX +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, ZLANGE, DLARMM + EXTERNAL ILAENV, LSAME, DLAMCH, ZLANGE, DLARMM +* .. +* .. External Subroutines .. + EXTERNAL ZLATRS, ZDSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. +* .. Executable Statements .. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOTRAN = LSAME( TRANS, 'N' ) + NOUNIT = LSAME( DIAG, 'N' ) + LQUERY = ( LWORK.EQ.-1 ) +* +* Partition A and X into blocks. +* + NB = MAX( NBMIN, ILAENV( 1, 'ZLATRS', '', N, N, -1, -1 ) ) + NB = MIN( NBMAX, NB ) + NBA = MAX( 1, (N + NB - 1) / NB ) + NBX = MAX( 1, (NRHS + NBRHS - 1) / NBRHS ) +* +* Compute the workspace +* +* The workspace comprises two parts. +* The first part stores the local scale factors. 
Each simultaneously +* computed right-hand side requires one local scale factor per block +* row. WORK( I + KK * LDS ) is the scale factor of the vector +* segment associated with the I-th block row and the KK-th vector +* in the block column. + LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) ) + LDS = NBA +* The second part stores upper bounds of the triangular A. There are +* a total of NBA x NBA blocks, of which only the upper triangular +* part or the lower triangular part is referenced. The upper bound of +* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). + LANRM = NBA * NBA + AWRK = LSCALE + WORK( 1 ) = LSCALE + LANRM +* +* Test the input parameters. +* + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -2 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -3 + ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. + $ LSAME( NORMIN, 'N' ) ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -6 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -8 + ELSE IF( LDX.LT.MAX( 1, N ) ) THEN + INFO = -10 + ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN + INFO = -14 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZLATRS3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Initialize scaling factors +* + DO KK = 1, NRHS + SCALE( KK ) = ONE + END DO +* +* Quick return if possible +* + IF( MIN( N, NRHS ).EQ.0 ) + $ RETURN +* +* Determine machine dependent constant to control overflow. +* + BIGNUM = DLAMCH( 'Overflow' ) + SMLNUM = DLAMCH( 'Safe Minimum' ) +* +* Use unblocked code for small problems +* + IF( NRHS.LT.NRHSMIN ) THEN + CALL ZLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, LDA, X( 1, 1), + $ SCALE( 1 ), CNORM, INFO ) + DO K = 2, NRHS + CALL ZLATRS( UPLO, TRANS, DIAG, 'Y', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Compute norms of blocks of A excluding diagonal blocks and find +* the block with the largest norm TMAX. +* + TMAX = ZERO + DO J = 1, NBA + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 + IF ( UPPER ) THEN + IFIRST = 1 + ILAST = J - 1 + ELSE + IFIRST = J + 1 + ILAST = NBA + END IF + DO I = IFIRST, ILAST + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Compute upper bound of A( I1:I2-1, J1:J2-1 ). +* + IF( NOTRAN ) THEN + ANRM = ZLANGE( 'I', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + I+(J-1)*NBA ) = ANRM + ELSE + ANRM = ZLANGE( '1', I2-I1, J2-J1, A( I1, J1 ), LDA, W ) + WORK( AWRK + J+(I-1) * NBA ) = ANRM + END IF + TMAX = MAX( TMAX, ANRM ) + END DO + END DO +* + IF( .NOT. TMAX.LE.DLAMCH('Overflow') ) THEN +* +* Some matrix entries have huge absolute value. At least one upper +* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point +* number, either due to overflow in LANGE or due to Inf in A. +* Fall back to LATRS. Set normin = 'N' for every right-hand side to +* force computation of TSCAL in LATRS to avoid the likely overflow +* in the computation of the column norms CNORM. +* + DO K = 1, NRHS + CALL ZLATRS( UPLO, TRANS, DIAG, 'N', N, A, LDA, X( 1, K ), + $ SCALE( K ), CNORM, INFO ) + END DO + RETURN + END IF +* +* Every right-hand side requires workspace to store NBA local scale +* factors. To save workspace, X is computed successively in block columns +* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient +* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. 
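+* As a minimal sketch with assumed sizes N = 200, NRHS = 50, NB = 64
+* and NBRHS = 32: NBA = 4 and NBX = 2, so X is processed in two block
+* columns of 32 and 18 right-hand sides, and the first part of WORK
+* reserves NBA*NBRHS = 128 local scale factors for them.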
+ DO K = 1, NBX +* Loop over block columns (index = K) of X and, for column-wise scalings, +* over individual columns (index = KK). +* K1: column index of the first column in X( J, K ) +* K2: column index of the first column in X( J, K+1 ) +* so the K2 - K1 is the column count of the block X( J, K ) + K1 = (K-1)*NBRHS + 1 + K2 = MIN( K*NBRHS, NRHS ) + 1 +* +* Initialize local scaling factors of current block column X( J, K ) +* + DO KK = 1, K2 - K1 + DO I = 1, NBA + WORK( I+KK*LDS ) = ONE + END DO + END DO +* + IF( NOTRAN ) THEN +* +* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* + IF( UPPER ) THEN + JFIRST = NBA + JLAST = 1 + JINC = -1 + ELSE + JFIRST = 1 + JLAST = NBA + JINC = 1 + END IF + ELSE +* +* Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) +* where op(A) = A**T or op(A) = A**H +* + IF( UPPER ) THEN + JFIRST = 1 + JLAST = NBA + JINC = 1 + ELSE + JFIRST = NBA + JLAST = 1 + JINC = -1 + END IF + END IF + + DO J = JFIRST, JLAST, JINC +* J1: row index of the first row in A( J, J ) +* J2: row index of the first row in A( J+1, J+1 ) +* so that J2 - J1 is the row count of the block A( J, J ) + J1 = (J-1)*NB + 1 + J2 = MIN( J*NB, N ) + 1 +* +* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) +* + DO KK = 1, K2 - K1 + RHS = K1 + KK - 1 + IF( KK.EQ.1 ) THEN + CALL ZLATRS( UPLO, TRANS, DIAG, 'N', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + ELSE + CALL ZLATRS( UPLO, TRANS, DIAG, 'Y', J2-J1, + $ A( J1, J1 ), LDA, X( J1, RHS ), + $ SCALOC, CNORM, INFO ) + END IF +* Find largest absolute value entry in the vector segment +* X( J1:J2-1, RHS ) as an upper bound for the worst case +* growth in the linear updates. + XNRM( KK ) = ZLANGE( 'I', J2-J1, 1, X( J1, RHS ), + $ LDX, W ) +* + IF( SCALOC .EQ. ZERO ) THEN +* LATRS found that A is singular through A(j,j) = 0. +* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 +* and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is +* set by LATRS. + SCALE( RHS ) = ZERO + DO II = 1, J1-1 + X( II, KK ) = CZERO + END DO + DO II = J2, N + X( II, KK ) = CZERO + END DO +* Discard the local scale factors. + DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + ELSE IF( SCALOC*WORK( J+KK*LDS ) .EQ. ZERO ) THEN +* LATRS computed a valid scale factor, but combined with +* the current scaling the solution does not have a +* scale factor > 0. +* +* Set WORK( J+KK*LDS ) to smallest valid scale +* factor and increase SCALOC accordingly. + SCAL = WORK( J+KK*LDS ) / SMLNUM + SCALOC = SCALOC * SCAL + WORK( J+KK*LDS ) = SMLNUM +* If LATRS overestimated the growth, x may be +* rescaled to preserve a valid combined scale +* factor WORK( J, KK ) > 0. + RSCAL = ONE / SCALOC + IF( XNRM( KK )*RSCAL .LE. BIGNUM ) THEN + XNRM( KK ) = XNRM( KK ) * RSCAL + CALL ZDSCAL( J2-J1, RSCAL, X( J1, RHS ), 1 ) + SCALOC = ONE + ELSE +* The system op(A) * x = b is badly scaled and its +* solution cannot be represented as (1/scale) * x. +* Set x to zero. This approach deviates from LATRS +* where a completely meaningless non-zero vector +* is returned that is not a solution to op(A) * x = b. + SCALE( RHS ) = ZERO + DO II = 1, N + X( II, KK ) = CZERO + END DO +* Discard the local scale factors. 
+ DO II = 1, NBA + WORK( II+KK*LDS ) = ONE + END DO + SCALOC = ONE + END IF + END IF + SCALOC = SCALOC * WORK( J+KK*LDS ) + WORK( J+KK*LDS ) = SCALOC + END DO +* +* Linear block updates +* + IF( NOTRAN ) THEN + IF( UPPER ) THEN + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + ELSE + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + END IF + ELSE + IF( UPPER ) THEN + IFIRST = J + 1 + ILAST = NBA + IINC = 1 + ELSE + IFIRST = J - 1 + ILAST = 1 + IINC = -1 + END IF + END IF +* + DO I = IFIRST, ILAST, IINC +* I1: row index of the first column in X( I, K ) +* I2: row index of the first column in X( I+1, K ) +* so the I2 - I1 is the row count of the block X( I, K ) + I1 = (I-1)*NB + 1 + I2 = MIN( I*NB, N ) + 1 +* +* Prepare the linear update to be executed with GEMM. +* For each column, compute a consistent scaling, a +* scaling factor to survive the linear update, and +* rescale the column segments, if necesssary. Then +* the linear update is safely executed. +* + DO KK = 1, K2 - K1 + RHS = K1 + KK - 1 +* Compute consistent scaling + SCAMIN = MIN( WORK( I+KK*LDS), WORK( J+KK*LDS ) ) +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + BNRM = ZLANGE( 'I', I2-I1, 1, X( I1, RHS ), LDX, W ) + BNRM = BNRM*( SCAMIN / WORK( I+KK*LDS ) ) + XNRM( KK ) = XNRM( KK )*( SCAMIN / WORK( J+KK*LDS) ) + ANRM = WORK( AWRK + I+(J-1)*NBA ) + SCALOC = DLARMM( ANRM, XNRM( KK ), BNRM ) +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to X( I, KK ) and X( J, KK ). +* + SCAL = ( SCAMIN / WORK( I+KK*LDS) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL ZDSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + WORK( I+KK*LDS ) = SCAMIN*SCALOC + END IF +* + SCAL = ( SCAMIN / WORK( J+KK*LDS ) )*SCALOC + IF( SCAL.NE.ONE ) THEN + CALL ZDSCAL( J2-J1, SCAL, X( J1, RHS ), 1 ) + WORK( J+KK*LDS ) = SCAMIN*SCALOC + END IF + END DO +* + IF( NOTRAN ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) +* + CALL ZGEMM( 'N', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( I1, J1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + ELSE IF( LSAME( TRANS, 'T' ) ) THEN +* +* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) +* + CALL ZGEMM( 'T', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + ELSE +* +* B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K ) +* + CALL ZGEMM( 'C', 'N', I2-I1, K2-K1, J2-J1, -CONE, + $ A( J1, I1 ), LDA, X( J1, K1 ), LDX, + $ CONE, X( I1, K1 ), LDX ) + END IF + END DO + END DO + +* +* Reduce local scaling factors +* + DO KK = 1, K2 - K1 + RHS = K1 + KK - 1 + DO I = 1, NBA + SCALE( RHS ) = MIN( SCALE( RHS ), WORK( I+KK*LDS ) ) + END DO + END DO +* +* Realize consistent scaling +* + DO KK = 1, K2 - K1 + RHS = K1 + KK - 1 + IF( SCALE( RHS ).NE.ONE .AND. SCALE( RHS ).NE. 
ZERO ) THEN + DO I = 1, NBA + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, N ) + 1 + SCAL = SCALE( RHS ) / WORK( I+KK*LDS ) + IF( SCAL.NE.ONE ) + $ CALL ZDSCAL( I2-I1, SCAL, X( I1, RHS ), 1 ) + END DO + END IF + END DO + END DO + RETURN +* +* End of ZLATRS3 +* + END diff --git a/lapack-netlib/SRC/ztrsyl3.f b/lapack-netlib/SRC/ztrsyl3.f new file mode 100644 index 000000000..b5a058da4 --- /dev/null +++ b/lapack-netlib/SRC/ztrsyl3.f @@ -0,0 +1,1142 @@ +*> \brief \b ZTRSYL3 +* +* Definition: +* =========== +* +* +*> \par Purpose +* ============= +*> +*> \verbatim +*> +*> ZTRSYL3 solves the complex Sylvester matrix equation: +*> +*> op(A)*X + X*op(B) = scale*C or +*> op(A)*X - X*op(B) = scale*C, +*> +*> where op(A) = A or A**H, and A and B are both upper triangular. A is +*> M-by-M and B is N-by-N; the right hand side C and the solution X are +*> M-by-N; and scale is an output scale factor, set <= 1 to avoid +*> overflow in X. +*> +*> This is the block version of the algorithm. +*> \endverbatim +* +* Arguments +* ========= +* +*> \param[in] TRANA +*> \verbatim +*> TRANA is CHARACTER*1 +*> Specifies the option op(A): +*> = 'N': op(A) = A (No transpose) +*> = 'C': op(A) = A**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] TRANB +*> \verbatim +*> TRANB is CHARACTER*1 +*> Specifies the option op(B): +*> = 'N': op(B) = B (No transpose) +*> = 'C': op(B) = B**H (Conjugate transpose) +*> \endverbatim +*> +*> \param[in] ISGN +*> \verbatim +*> ISGN is INTEGER +*> Specifies the sign in the equation: +*> = +1: solve op(A)*X + X*op(B) = scale*C +*> = -1: solve op(A)*X - X*op(B) = scale*C +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The order of the matrix A, and the number of rows in the +*> matrices X and C. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the matrix B, and the number of columns in the +*> matrices X and C. N >= 0. +*> \endverbatim +*> +*> \param[in] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,M) +*> The upper triangular matrix A. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] B +*> \verbatim +*> B is COMPLEX*16 array, dimension (LDB,N) +*> The upper triangular matrix B. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,N). +*> \endverbatim +*> +*> \param[in,out] C +*> \verbatim +*> C is COMPLEX*16 array, dimension (LDC,N) +*> On entry, the M-by-N right hand side matrix C. +*> On exit, C is overwritten by the solution matrix X. +*> \endverbatim +*> +*> \param[in] LDC +*> \verbatim +*> LDC is INTEGER +*> The leading dimension of the array C. LDC >= max(1,M) +*> \endverbatim +*> +*> \param[out] SCALE +*> \verbatim +*> SCALE is DOUBLE PRECISION +*> The scale factor, scale, set <= 1 to avoid overflow in X. +*> \endverbatim +*> +*> \param[out] SWORK +*> \verbatim +*> SWORK is DOUBLE PRECISION array, dimension (MAX(2, ROWS), +*> MAX(1,COLS)). +*> On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS +*> and SWORK(2) returns the optimal COLS. +*> \endverbatim +*> +*> \param[in] LDSWORK +*> \verbatim +*> LDSWORK is INTEGER +*> LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) +*> and NB is the optimal block size. 
+*> +*> If LDSWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal dimensions of the SWORK matrix, +*> returns these values as the first and second entry of the SWORK +*> matrix, and no error message related LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> = 1: A and B have common or very close eigenvalues; perturbed +*> values were used to solve the equation (but the matrices +*> A and B are unchanged). +*> \endverbatim +* +*> \ingroup complex16SYcomputational +* +* ===================================================================== +* References: +* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of +* algorithms: The triangular Sylvester equation, ACM Transactions +* on Mathematical Software (TOMS), volume 29, pages 218--243. +* +* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel +* Solution of the Triangular Sylvester Equation. Lecture Notes in +* Computer Science, vol 12043, pages 82--92, Springer. +* +* Contributor: +* Angelika Schwarz, Umea University, Sweden. +* +* ===================================================================== + SUBROUTINE ZTRSYL3( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, C, + $ LDC, SCALE, SWORK, LDSWORK, INFO ) + IMPLICIT NONE +* +* .. Scalar Arguments .. + CHARACTER TRANA, TRANB + INTEGER INFO, ISGN, LDA, LDB, LDC, LDSWORK, M, N + DOUBLE PRECISION SCALE +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ) + DOUBLE PRECISION SWORK( LDSWORK, * ) +* .. +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D0, 0.0D0 ) ) +* .. +* .. Local Scalars .. + LOGICAL NOTRNA, NOTRNB, LQUERY + INTEGER AWRK, BWRK, I, I1, I2, IINFO, J, J1, J2, JJ, + $ K, K1, K2, L, L1, L2, LL, NBA, NB, NBB + DOUBLE PRECISION ANRM, BIGNUM, BNRM, CNRM, SCAL, SCALOC, + $ SCAMIN, SGN, XNRM, BUF, SMLNUM + COMPLEX*16 CSGN +* .. +* .. Local Arrays .. + DOUBLE PRECISION WNRM( MAX( M, N ) ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, DLARMM, ZLANGE + EXTERNAL DLAMCH, DLARMM, ILAENV, LSAME, ZLANGE +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZDSCAL, ZGEMM, ZLASCL, ZTRSYL +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, DIMAG, EXPONENT, MAX, MIN +* .. +* .. Executable Statements .. +* +* Decode and Test input parameters +* + NOTRNA = LSAME( TRANA, 'N' ) + NOTRNB = LSAME( TRANB, 'N' ) +* +* Use the same block size for all matrices. +* + NB = MAX( 8, ILAENV( 1, 'ZTRSYL', '', M, N, -1, -1) ) +* +* Compute number of blocks in A and B +* + NBA = MAX( 1, (M + NB - 1) / NB ) + NBB = MAX( 1, (N + NB - 1) / NB ) +* +* Compute workspace +* + INFO = 0 + LQUERY = ( LDSWORK.EQ.-1 ) + IF( LQUERY ) THEN + LDSWORK = 2 + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + END IF +* +* Test the input arguments +* + IF( .NOT.NOTRNA .AND. .NOT. LSAME( TRANA, 'C' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOTRNB .AND. .NOT. LSAME( TRANB, 'C' ) ) THEN + INFO = -2 + ELSE IF( ISGN.NE.1 .AND. 
ISGN.NE.-1 ) THEN + INFO = -3 + ELSE IF( M.LT.0 ) THEN + INFO = -4 + ELSE IF( N.LT.0 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -9 + ELSE IF( LDC.LT.MAX( 1, M ) ) THEN + INFO = -11 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZTRSYL3', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + SCALE = ONE + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Use unblocked code for small problems or if insufficient +* workspace is provided +* + IF( MIN( NBA, NBB ).EQ.1 .OR. LDSWORK.LT.MAX( NBA, NBB ) ) THEN + CALL ZTRSYL( TRANA, TRANB, ISGN, M, N, A, LDA, B, LDB, + $ C, LDC, SCALE, INFO ) + RETURN + END IF +* +* Set constants to control overflow +* + SMLNUM = DLAMCH( 'S' ) + BIGNUM = ONE / SMLNUM +* +* Set local scaling factors. +* + DO L = 1, NBB + DO K = 1, NBA + SWORK( K, L ) = ONE + END DO + END DO +* +* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. +* This scaling is to ensure compatibility with TRSYL and may get flushed. +* + BUF = ONE +* +* Compute upper bounds of blocks of A and B +* + AWRK = NBB + DO K = 1, NBA + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = K, NBA + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, M ) + 1 + IF( NOTRNA ) THEN + SWORK( K, AWRK + L ) = ZLANGE( 'I', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + ELSE + SWORK( L, AWRK + K ) = ZLANGE( '1', K2-K1, L2-L1, + $ A( K1, L1 ), LDA, WNRM ) + END IF + END DO + END DO + BWRK = NBB + NBA + DO K = 1, NBB + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, N ) + 1 + DO L = K, NBB + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 + IF( NOTRNB ) THEN + SWORK( K, BWRK + L ) = ZLANGE( 'I', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + ELSE + SWORK( L, BWRK + K ) = ZLANGE( '1', K2-K1, L2-L1, + $ B( K1, L1 ), LDB, WNRM ) + END IF + END DO + END DO +* + SGN = DBLE( ISGN ) + CSGN = DCMPLX( SGN, ZERO ) +* + IF( NOTRNA .AND. NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-left corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* M L-1 +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. +* I=K+1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL ZTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. 
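+*                    The factor 2**EXPONENT( SCALOC ) absorbed into
+*                    BUF above (when SCALOC is nonzero) is divided
+*                    out of every entry of SWORK here so that the
+*                    per-block scale factors stay representable;
+*                    BUF rejoins the reported scale factor at the
+*                    end via SCALE = SCALE * BUF.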
+ SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = ZLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K - 1, 1, -1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L ). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( I2-I1, SCAL, C( I1, LL ), 1) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK(L, BWRK + J) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. NOTRNB ) THEN +* +* Solve A**H *X + ISGN*X*B = scale*C. 
+* +* The (K,L)th block of X is determined starting from +* upper-left corner column by column by +* +* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) +* +* Where +* K-1 L-1 +* R(K,L) = SUM [A(I,K)**H*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] +* I=1 J=1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL ZTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = ZLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'C', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) + END DO +* + DO J = L + 1, NBB +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'N', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( L1, J1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( .NOT.NOTRNA .AND. .NOT.NOTRNB ) THEN +* +* Solve A**H *X + ISGN*X*B**H = scale*C. +* +* The (K,L)th block of X is determined starting from +* top-right corner column by column by +* +* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) +* +* Where +* K-1 N +* R(K,L) = SUM [A(I,K)**H*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. +* I=1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = 1, NBA +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL ZTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. 
The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = ZLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = K + 1, NBA +* +* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'C', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( K1, I1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'C', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO + ELSE IF( NOTRNA .AND. 
.NOT.NOTRNB ) THEN +* +* Solve A*X + ISGN*X*B**H = scale*C. +* +* The (K,L)th block of X is determined starting from +* bottom-right corner column by column by +* +* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) +* +* Where +* M N +* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. +* I=K+1 J=L+1 +* +* Start loop over block rows (index = K) and block columns (index = L) +* + DO K = NBA, 1, -1 +* +* K1: row index of the first row in X( K, L ) +* K2: row index of the first row in X( K+1, L ) +* so the K2 - K1 is the column count of the block X( K, L ) +* + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = NBB, 1, -1 +* +* L1: column index of the first column in X( K, L ) +* L2: column index of the first column in X( K, L + 1) +* so that L2 - L1 is the row count of the block X( K, L ) +* + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 +* + CALL ZTRSYL( TRANA, TRANB, ISGN, K2-K1, L2-L1, + $ A( K1, K1 ), LDA, + $ B( L1, L1 ), LDB, + $ C( K1, L1 ), LDC, SCALOC, IINFO ) + INFO = MAX( INFO, IINFO ) +* + IF( SCALOC * SWORK( K, L ) .EQ. ZERO ) THEN + IF( SCALOC .EQ. ZERO ) THEN +* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) +* is larger than the product of BIGNUM**2 and cannot be +* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). +* Mark the computation as pointless. + BUF = ZERO + ELSE +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + END IF + DO JJ = 1, NBB + DO LL = 1, NBA +* Bound by BIGNUM to not introduce Inf. The value +* is irrelevant; corresponding entries of the +* solution will be flushed in consistency scaling. + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + END IF + SWORK( K, L ) = SCALOC * SWORK( K, L ) + XNRM = ZLANGE( 'I', K2-K1, L2-L1, C( K1, L1 ), LDC, + $ WNRM ) +* + DO I = 1, K - 1 +* +* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) +* + I1 = (I - 1) * NB + 1 + I2 = MIN( I * NB, M ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', I2-I1, L2-L1, C( I1, L1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( I, L ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( I, L ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + ANRM = SWORK( I, AWRK + K ) + SCALOC = DLARMM( ANRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( I, L ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( I, L ) ) * SCALOC + IF( SCAL .NE. 
ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( I2-I1, SCAL, C( I1, LL ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( I, L ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'N', I2-I1, L2-L1, K2-K1, -CONE, + $ A( I1, K1 ), LDA, C( K1, L1 ), LDC, + $ CONE, C( I1, L1 ), LDC ) +* + END DO +* + DO J = 1, L - 1 +* +* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H +* + J1 = (J - 1) * NB + 1 + J2 = MIN( J * NB, N ) + 1 +* +* Compute scaling factor to survive the linear update +* simulating consistent scaling. +* + CNRM = ZLANGE( 'I', K2-K1, J2-J1, C( K1, J1 ), + $ LDC, WNRM ) + SCAMIN = MIN( SWORK( K, J ), SWORK( K, L ) ) + CNRM = CNRM * ( SCAMIN / SWORK( K, J ) ) + XNRM = XNRM * ( SCAMIN / SWORK( K, L ) ) + BNRM = SWORK( L, BWRK + J ) + SCALOC = DLARMM( BNRM, XNRM, CNRM ) + IF( SCALOC * SCAMIN .EQ. ZERO ) THEN +* Use second scaling factor to prevent flushing to zero. + BUF = BUF*2.D0**EXPONENT( SCALOC ) + DO JJ = 1, NBB + DO LL = 1, NBA + SWORK( LL, JJ ) = MIN( BIGNUM, + $ SWORK( LL, JJ ) / 2.D0**EXPONENT( SCALOC ) ) + END DO + END DO + SCAMIN = SCAMIN / 2.D0**EXPONENT( SCALOC ) + SCALOC = SCALOC / 2.D0**EXPONENT( SCALOC ) + END IF + CNRM = CNRM * SCALOC + XNRM = XNRM * SCALOC +* +* Simultaneously apply the robust update factor and the +* consistency scaling factor to C( K, J ) and C( K, L). +* + SCAL = ( SCAMIN / SWORK( K, L ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* + SCAL = ( SCAMIN / SWORK( K, J ) ) * SCALOC + IF( SCAL .NE. ONE ) THEN + DO JJ = J1, J2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, JJ ), 1 ) + END DO + ENDIF +* +* Record current scaling factor +* + SWORK( K, L ) = SCAMIN * SCALOC + SWORK( K, J ) = SCAMIN * SCALOC +* + CALL ZGEMM( 'N', 'C', K2-K1, J2-J1, L2-L1, -CSGN, + $ C( K1, L1 ), LDC, B( J1, L1 ), LDB, + $ CONE, C( K1, J1 ), LDC ) + END DO + END DO + END DO +* + END IF +* +* Reduce local scaling factors +* + SCALE = SWORK( 1, 1 ) + DO K = 1, NBA + DO L = 1, NBB + SCALE = MIN( SCALE, SWORK( K, L ) ) + END DO + END DO + IF( SCALE .EQ. ZERO ) THEN +* +* The magnitude of the largest entry of the solution is larger +* than the product of BIGNUM**2 and cannot be represented in the +* form (1/SCALE)*X if SCALE is DOUBLE PRECISION. Set SCALE to +* zero and give up. +* + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA + RETURN + END IF +* +* Realize consistent scaling +* + DO K = 1, NBA + K1 = (K - 1) * NB + 1 + K2 = MIN( K * NB, M ) + 1 + DO L = 1, NBB + L1 = (L - 1) * NB + 1 + L2 = MIN( L * NB, N ) + 1 + SCAL = SCALE / SWORK( K, L ) + IF( SCAL .NE. ONE ) THEN + DO LL = L1, L2-1 + CALL ZDSCAL( K2-K1, SCAL, C( K1, LL ), 1 ) + END DO + ENDIF + END DO + END DO +* + IF( BUF .NE. ONE .AND. BUF.GT.ZERO ) THEN +* +* Decrease SCALE as much as possible. +* + SCALOC = MIN( SCALE / SMLNUM, ONE / BUF ) + BUF = BUF * SCALOC + SCALE = SCALE / SCALOC + END IF +* + IF( BUF.NE.ONE .AND. BUF.GT.ZERO ) THEN +* +* In case of overly aggressive scaling during the computation, +* flushing of the global scale factor may be prevented by +* undoing some of the scaling. This step is to ensure that +* this routine flushes only scale factors that TRSYL also +* flushes and be usable as a drop-in replacement. +* +* How much can the normwise largest entry be upscaled? 
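+*
+*        As a hypothetical illustration: if BUF = 0.25 at this point
+*        and the largest entry magnitude computed below is
+*        SCAL = BIGNUM / 8, then
+*        SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) = MIN( 8, 4 ) = 4,
+*        BUF is restored to ONE, C is multiplied by 4 by ZLASCL, and
+*        the final SCALE = SCALE * BUF is left untouched.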
+* + SCAL = MAX( ABS( DBLE( C( 1, 1 ) ) ), + $ ABS( DIMAG( C ( 1, 1 ) ) ) ) + DO K = 1, M + DO L = 1, N + SCAL = MAX( SCAL, ABS( DBLE ( C( K, L ) ) ), + $ ABS( DIMAG ( C( K, L ) ) ) ) + END DO + END DO +* +* Increase BUF as close to 1 as possible and apply scaling. +* + SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) + BUF = BUF * SCALOC + CALL ZLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IINFO ) + END IF +* +* Combine with buffer scaling factor. SCALE will be flushed if +* BUF is less than one here. +* + SCALE = SCALE * BUF +* +* Restore workspace dimensions +* + SWORK(1,1) = MAX( NBA, NBB ) + SWORK(2,1) = 2 * NBB + NBA +* + RETURN +* +* End of ZTRSYL3 +* + END From 92174725d90916a1942030d4afa4ef2f6a9e8a0c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 23:16:12 +0100 Subject: [PATCH 087/154] Add a BLAS3-based triangular Sylvester equation solver (Reference-LAPACK PR 651) --- lapack-netlib/TESTING/EIG/CMakeLists.txt | 11 +- lapack-netlib/TESTING/EIG/Makefile | 24 +- lapack-netlib/TESTING/EIG/cchkec.f | 42 +++- lapack-netlib/TESTING/EIG/cerrec.f | 43 +++- lapack-netlib/TESTING/EIG/csyl01.f | 294 +++++++++++++++++++++++ lapack-netlib/TESTING/EIG/dchkec.f | 46 +++- lapack-netlib/TESTING/EIG/derrec.f | 41 +++- lapack-netlib/TESTING/EIG/dsyl01.f | 288 ++++++++++++++++++++++ lapack-netlib/TESTING/EIG/schkec.f | 46 +++- lapack-netlib/TESTING/EIG/serrec.f | 41 +++- lapack-netlib/TESTING/EIG/ssyl01.f | 288 ++++++++++++++++++++++ lapack-netlib/TESTING/EIG/zchkec.f | 42 +++- lapack-netlib/TESTING/EIG/zerrec.f | 41 +++- lapack-netlib/TESTING/EIG/zsyl01.f | 294 +++++++++++++++++++++++ 14 files changed, 1468 insertions(+), 73 deletions(-) create mode 100644 lapack-netlib/TESTING/EIG/csyl01.f create mode 100644 lapack-netlib/TESTING/EIG/dsyl01.f create mode 100644 lapack-netlib/TESTING/EIG/ssyl01.f create mode 100644 lapack-netlib/TESTING/EIG/zsyl01.f diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index 226004a90..3c8d9a8b2 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -40,7 +40,7 @@ set(SEIGTST schkee.F sget54.f sglmts.f sgqrts.f sgrqts.f sgsvts3.f shst01.f slarfy.f slarhs.f slatm4.f slctes.f slctsx.f slsets.f sort01.f sort03.f ssbt21.f ssgt01.f sslect.f sspt21.f sstt21.f - sstt22.f ssyt21.f ssyt22.f) + sstt22.f ssyl01.f ssyt21.f ssyt22.f) set(CEIGTST cchkee.F cbdt01.f cbdt02.f cbdt03.f cbdt05.f @@ -56,7 +56,7 @@ set(CEIGTST cchkee.F cget54.f cglmts.f cgqrts.f cgrqts.f cgsvts3.f chbt21.f chet21.f chet22.f chpt21.f chst01.f clarfy.f clarhs.f clatm4.f clctes.f clctsx.f clsets.f csbmv.f - csgt01.f cslect.f + csgt01.f cslect.f csyl01.f cstt21.f cstt22.f cunt01.f cunt03.f) set(DZIGTST dlafts.f dlahd2.f dlasum.f dlatb9.f dstech.f dstect.f @@ -77,7 +77,7 @@ set(DEIGTST dchkee.F dget54.f dglmts.f dgqrts.f dgrqts.f dgsvts3.f dhst01.f dlarfy.f dlarhs.f dlatm4.f dlctes.f dlctsx.f dlsets.f dort01.f dort03.f dsbt21.f dsgt01.f dslect.f dspt21.f dstt21.f - dstt22.f dsyt21.f dsyt22.f) + dstt22.f dsyl01.f dsyt21.f dsyt22.f) set(ZEIGTST zchkee.F zbdt01.f zbdt02.f zbdt03.f zbdt05.f @@ -93,13 +93,12 @@ set(ZEIGTST zchkee.F zget54.f zglmts.f zgqrts.f zgrqts.f zgsvts3.f zhbt21.f zhet21.f zhet22.f zhpt21.f zhst01.f zlarfy.f zlarhs.f zlatm4.f zlctes.f zlctsx.f zlsets.f zsbmv.f - zsgt01.f zslect.f + zsgt01.f zslect.f zsyl01.f zstt21.f zstt22.f zunt01.f zunt03.f) macro(add_eig_executable name) add_executable(${name} ${ARGN}) - target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) -#${TMGLIB} 
../${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) + target_link_libraries(${name} ${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) endmacro() if(BUILD_SINGLE) diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile index bccfccf95..e40358663 100644 --- a/lapack-netlib/TESTING/EIG/Makefile +++ b/lapack-netlib/TESTING/EIG/Makefile @@ -62,7 +62,7 @@ SEIGTST = schkee.o \ sget54.o sglmts.o sgqrts.o sgrqts.o sgsvts3.o \ shst01.o slarfy.o slarhs.o slatm4.o slctes.o slctsx.o slsets.o sort01.o \ sort03.o ssbt21.o ssgt01.o sslect.o sspt21.o sstt21.o \ - sstt22.o ssyt21.o ssyt22.o + sstt22.o ssyl01.o ssyt21.o ssyt22.o CEIGTST = cchkee.o \ cbdt01.o cbdt02.o cbdt03.o cbdt05.o \ @@ -78,7 +78,7 @@ CEIGTST = cchkee.o \ cget54.o cglmts.o cgqrts.o cgrqts.o cgsvts3.o \ chbt21.o chet21.o chet22.o chpt21.o chst01.o \ clarfy.o clarhs.o clatm4.o clctes.o clctsx.o clsets.o csbmv.o \ - csgt01.o cslect.o \ + csgt01.o cslect.o csyl01.o\ cstt21.o cstt22.o cunt01.o cunt03.o DZIGTST = dlafts.o dlahd2.o dlasum.o dlatb9.o dstech.o dstect.o \ @@ -99,7 +99,7 @@ DEIGTST = dchkee.o \ dget54.o dglmts.o dgqrts.o dgrqts.o dgsvts3.o \ dhst01.o dlarfy.o dlarhs.o dlatm4.o dlctes.o dlctsx.o dlsets.o dort01.o \ dort03.o dsbt21.o dsgt01.o dslect.o dspt21.o dstt21.o \ - dstt22.o dsyt21.o dsyt22.o + dstt22.o dsyl01.o dsyt21.o dsyt22.o ZEIGTST = zchkee.o \ zbdt01.o zbdt02.o zbdt03.o zbdt05.o \ @@ -115,7 +115,7 @@ ZEIGTST = zchkee.o \ zget54.o zglmts.o zgqrts.o zgrqts.o zgsvts3.o \ zhbt21.o zhet21.o zhet22.o zhpt21.o zhst01.o \ zlarfy.o zlarhs.o zlatm4.o zlctes.o zlctsx.o zlsets.o zsbmv.o \ - zsgt01.o zslect.o \ + zsgt01.o zslect.o zsyl01.o\ zstt21.o zstt22.o zunt01.o zunt03.o .PHONY: all @@ -127,17 +127,17 @@ complex: xeigtstc double: xeigtstd complex16: xeigtstz -xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) - $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) + $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ $(AEIGTST): $(FRC) $(SCIGTST): $(FRC) diff --git a/lapack-netlib/TESTING/EIG/cchkec.f b/lapack-netlib/TESTING/EIG/cchkec.f index 6727a0954..c892b0a54 100644 --- a/lapack-netlib/TESTING/EIG/cchkec.f +++ b/lapack-netlib/TESTING/EIG/cchkec.f @@ -23,7 +23,7 @@ *> \verbatim *> *> CCHKEC tests eigen- condition estimation routines -*> CTRSYL, CTREXC, CTRSNA, CTRSEN +*> CTRSYL, CTRSYL3, CTREXC, CTRSNA, CTRSEN *> *> In all cases, the routine runs through a fixed set of numerical *> examples, subjects them to various tests, and compares the test @@ -88,17 +88,17 @@ * .. Local Scalars .. 
LOGICAL OK CHARACTER*3 PATH - INTEGER KTREXC, KTRSEN, KTRSNA, KTRSYL, LTREXC, LTRSYL, - $ NTESTS, NTREXC, NTRSYL - REAL EPS, RTREXC, RTRSYL, SFMIN + INTEGER KTREXC, KTRSEN, KTRSNA, KTRSYL, KTRSYL3, + $ LTREXC, LTRSYL, NTESTS, NTREXC, NTRSYL + REAL EPS, RTREXC, SFMIN * .. * .. Local Arrays .. - INTEGER LTRSEN( 3 ), LTRSNA( 3 ), NTRSEN( 3 ), - $ NTRSNA( 3 ) - REAL RTRSEN( 3 ), RTRSNA( 3 ) + INTEGER FTRSYL( 3 ), ITRSYL( 2 ), LTRSEN( 3 ), + $ LTRSNA( 3 ), NTRSEN( 3 ), NTRSNA( 3 ) + REAL RTRSEN( 3 ), RTRSNA( 3 ), RTRSYL( 2 ) * .. * .. External Subroutines .. - EXTERNAL CERREC, CGET35, CGET36, CGET37, CGET38 + EXTERNAL CERREC, CGET35, CGET36, CGET37, CGET38, CSYL01 * .. * .. External Functions .. REAL SLAMCH @@ -120,10 +120,24 @@ $ CALL CERREC( PATH, NOUT ) * OK = .TRUE. - CALL CGET35( RTRSYL, LTRSYL, NTRSYL, KTRSYL, NIN ) - IF( RTRSYL.GT.THRESH ) THEN + CALL CGET35( RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL, NIN ) + IF( RTRSYL( 1 ).GT.THRESH ) THEN OK = .FALSE. - WRITE( NOUT, FMT = 9999 )RTRSYL, LTRSYL, NTRSYL, KTRSYL + WRITE( NOUT, FMT = 9999 )RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL + END IF +* + CALL CSYL01( THRESH, FTRSYL, RTRSYL, ITRSYL, KTRSYL3 ) + IF( FTRSYL( 1 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9970 )FTRSYL( 1 ), RTRSYL( 1 ), THRESH + END IF + IF( FTRSYL( 2 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9971 )FTRSYL( 2 ), RTRSYL( 2 ), THRESH + END IF + IF( FTRSYL( 3 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9972 )FTRSYL( 3 ) END IF * CALL CGET36( RTREXC, LTREXC, NTREXC, KTREXC, NIN ) @@ -169,6 +183,12 @@ $ / ' Safe minimum (SFMIN) = ', E16.6, / ) 9992 FORMAT( ' Routines pass computational tests if test ratio is ', $ 'less than', F8.2, / / ) + 9972 FORMAT( 'CTRSYL and CTRSYL3 compute an inconsistent scale ', + $ 'factor in ', I8, ' tests.') + 9971 FORMAT( 'Error in CTRSYL3: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9970 FORMAT( 'Error in CTRSYL: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) RETURN * * End of CCHKEC diff --git a/lapack-netlib/TESTING/EIG/cerrec.f b/lapack-netlib/TESTING/EIG/cerrec.f index 650ab2b6e..6e2e1d38a 100644 --- a/lapack-netlib/TESTING/EIG/cerrec.f +++ b/lapack-netlib/TESTING/EIG/cerrec.f @@ -23,7 +23,7 @@ *> *> CERREC tests the error exits for the routines for eigen- condition *> estimation for REAL matrices: -*> CTRSYL, CTREXC, CTRSNA and CTRSEN. +*> CTRSYL, CTRSYL3, CTREXC, CTRSNA and CTRSEN. *> \endverbatim * * Arguments: @@ -77,12 +77,12 @@ * .. * .. Local Arrays .. LOGICAL SEL( NMAX ) - REAL RW( LW ), S( NMAX ), SEP( NMAX ) + REAL RW( LW ), S( NMAX ), SEP( NMAX ), SWORK( NMAX ) COMPLEX A( NMAX, NMAX ), B( NMAX, NMAX ), $ C( NMAX, NMAX ), WORK( LW ), X( NMAX ) * .. * .. External Subroutines .. - EXTERNAL CHKXER, CTREXC, CTRSEN, CTRSNA, CTRSYL + EXTERNAL CHKXER, CTREXC, CTRSEN, CTRSNA, CTRSYL, CTRSYL3 * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -141,6 +141,43 @@ CALL CHKXER( 'CTRSYL', INFOT, NOUT, LERR, OK ) NT = NT + 8 * +* Test CTRSYL3 +* + SRNAMT = 'CTRSYL3' + INFOT = 1 + CALL CTRSYL3( 'X', 'N', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTRSYL3( 'N', 'X', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CTRSYL3( 'N', 'N', 0, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTRSYL3( 'N', 'N', 1, -1, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSYL3( 'N', 'N', 1, 0, -1, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CTRSYL3( 'N', 'N', 1, 2, 0, A, 1, B, 1, C, 2, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSYL3( 'N', 'N', 1, 0, 2, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSYL3( 'N', 'N', 1, 2, 0, A, 2, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'CTRSYL3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 +* * Test CTREXC * SRNAMT = 'CTREXC' diff --git a/lapack-netlib/TESTING/EIG/csyl01.f b/lapack-netlib/TESTING/EIG/csyl01.f new file mode 100644 index 000000000..e21f1a7a0 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/csyl01.f @@ -0,0 +1,294 @@ +*> \brief \b CSYL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE CSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) +* +* .. Scalar Arguments .. +* INTEGER KNT +* REAL THRESH +* .. +* .. Array Arguments .. +* INTEGER NFAIL( 3 ), NINFO( 2 ) +* REAL RMAX( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CSYL01 tests CTRSYL and CTRSYL3, routines for solving the Sylvester matrix +*> equation +*> +*> op(A)*X + ISGN*X*op(B) = scale*C, +*> +*> where op(A) and op(B) are both upper triangular form, op() represents an +*> optional conjugate transpose, and ISGN can be -1 or +1. Scale is an output +*> less than or equal to 1, chosen to avoid overflow in X. +*> +*> The test code verifies that the following residual does not exceed +*> the provided threshold: +*> +*> norm(op(A)*X + ISGN*X*op(B) - scale*C) / +*> (EPS*max(norm(A),norm(B))*norm(X)) +*> +*> This routine complements CGET35 by testing with larger, +*> random matrices, of which some require rescaling of X to avoid overflow. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is REAL +*> A test will count as "failed" if the residual, computed as +*> described above, exceeds THRESH. +*> \endverbatim +*> +*> \param[out] NFAIL +*> \verbatim +*> NFAIL is INTEGER array, dimension (3) +*> NFAIL(1) = No. of times residual CTRSYL exceeds threshold THRESH +*> NFAIL(2) = No. of times residual CTRSYL3 exceeds threshold THRESH +*> NFAIL(3) = No. of times CTRSYL3 and CTRSYL deviate +*> \endverbatim +*> +*> \param[out] RMAX +*> \verbatim +*> RMAX is DOUBLE PRECISION array, dimension (2) +*> RMAX(1) = Value of the largest test ratio of CTRSYL +*> RMAX(2) = Value of the largest test ratio of CTRSYL3 +*> \endverbatim +*> +*> \param[out] NINFO +*> \verbatim +*> NINFO is INTEGER array, dimension (2) +*> NINFO(1) = No. 
of times CTRSYL where INFO is nonzero +*> NINFO(2) = No. of times CTRSYL3 where INFO is nonzero +*> \endverbatim +*> +*> \param[out] KNT +*> \verbatim +*> KNT is INTEGER +*> Total number of examples tested. +*> \endverbatim + +* +* -- LAPACK test routine -- + SUBROUTINE CSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER KNT + REAL THRESH +* .. +* .. Array Arguments .. + INTEGER NFAIL( 3 ), NINFO( 2 ) + REAL RMAX( 2 ) +* .. +* +* ===================================================================== +* .. +* .. Parameters .. + COMPLEX CONE + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) + REAL ONE, ZERO + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + INTEGER MAXM, MAXN, LDSWORK + PARAMETER ( MAXM = 101, MAXN = 138, LDSWORK = 18 ) +* .. +* .. Local Scalars .. + CHARACTER TRANA, TRANB + INTEGER I, INFO, IINFO, ISGN, ITRANA, ITRANB, J, KLA, + $ KUA, KLB, KUB, M, N + REAL ANRM, BNRM, BIGNUM, EPS, RES, RES1, + $ SCALE, SCALE3, SMLNUM, TNRM, XNRM + COMPLEX RMUL +* .. +* .. Local Arrays .. + COMPLEX A( MAXM, MAXM ), B( MAXN, MAXN ), + $ C( MAXM, MAXN ), CC( MAXM, MAXN ), + $ X( MAXM, MAXN ), + $ DUML( MAXM ), DUMR( MAXN ), + $ D( MIN( MAXM, MAXN ) ) + REAL SWORK( LDSWORK, 54 ), DUM( MAXN ), VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) +* .. +* .. External Functions .. + LOGICAL SISNAN + REAL SLAMCH, CLANGE + EXTERNAL SISNAN, SLAMCH, CLANGE +* .. +* .. External Subroutines .. + EXTERNAL CLATMR, CLACPY, CGEMM, CTRSYL, CTRSYL3 +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, REAL, MAX +* .. +* .. Executable Statements .. +* +* Get machine parameters +* + EPS = SLAMCH( 'P' ) + SMLNUM = SLAMCH( 'S' ) / EPS + BIGNUM = ONE / SMLNUM +* +* Expect INFO = 0 + VM( 1 ) = ONE +* Expect INFO = 1 + VM( 2 ) = 0.5E+0 +* +* Begin test loop +* + NINFO( 1 ) = 0 + NINFO( 2 ) = 0 + NFAIL( 1 ) = 0 + NFAIL( 2 ) = 0 + NFAIL( 3 ) = 0 + RMAX( 1 ) = ZERO + RMAX( 2 ) = ZERO + KNT = 0 + ISEED( 1 ) = 1 + ISEED( 2 ) = 1 + ISEED( 3 ) = 1 + ISEED( 4 ) = 1 + SCALE = ONE + SCALE3 = ONE + DO J = 1, 2 + DO ISGN = -1, 1, 2 +* Reset seed (overwritten by LATMR) + ISEED( 1 ) = 1 + ISEED( 2 ) = 1 + ISEED( 3 ) = 1 + ISEED( 4 ) = 1 + DO M = 32, MAXM, 23 + KLA = 0 + KUA = M - 1 + CALL CLATMR( M, M, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLA, KUA, ZERO, + $ ONE, 'NO', A, MAXM, IWORK, + $ IINFO ) + DO I = 1, M + A( I, I ) = A( I, I ) * VM( J ) + END DO + ANRM = CLANGE( 'M', M, M, A, MAXM, DUM ) + DO N = 51, MAXN, 29 + KLB = 0 + KUB = N - 1 + CALL CLATMR( N, N, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLB, KUB, ZERO, + $ ONE, 'NO', B, MAXN, IWORK, + $ IINFO ) + DO I = 1, N + B( I, I ) = B( I, I ) * VM ( J ) + END DO + BNRM = CLANGE( 'M', N, N, B, MAXN, DUM ) + TNRM = MAX( ANRM, BNRM ) + CALL CLATMR( M, N, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, M, N, ZERO, ONE, + $ 'NO', C, MAXM, IWORK, IINFO ) + DO ITRANA = 1, 2 + IF( ITRANA.EQ.1 ) + $ TRANA = 'N' + IF( ITRANA.EQ.2 ) + $ TRANA = 'C' + DO ITRANB = 1, 2 + IF( ITRANB.EQ.1 ) + $ TRANB = 'N' + IF( ITRANB.EQ.2 ) + $ TRANB = 'C' + KNT = KNT + 1 +* + CALL CLACPY( 'All', M, N, C, MAXM, X, MAXM) + CALL CLACPY( 'All', M, N, C, MAXM, CC, MAXM) + CALL CTRSYL( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ 
SCALE, IINFO ) + IF( IINFO.NE.0 ) + $ NINFO( 1 ) = NINFO( 1 ) + 1 + XNRM = CLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = CONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = CONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL CGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE*RMUL, + $ CC, MAXM ) + CALL CGEMM( 'N', TRANB, M, N, N, + $ REAL( ISGN )*RMUL, X, MAXM, B, + $ MAXN, CONE, CC, MAXM ) + RES1 = CLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( ABS( RMUL )*TNRM )*EPS )*XNRM ) + IF( RES.GT.THRESH ) + $ NFAIL( 1 ) = NFAIL( 1 ) + 1 + IF( RES.GT.RMAX( 1 ) ) + $ RMAX( 1 ) = RES +* + CALL CLACPY( 'All', M, N, C, MAXM, X, MAXM ) + CALL CLACPY( 'All', M, N, C, MAXM, CC, MAXM ) + CALL CTRSYL3( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE3, SWORK, LDSWORK, INFO) + IF( INFO.NE.0 ) + $ NINFO( 2 ) = NINFO( 2 ) + 1 + XNRM = CLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = CONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = CONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL CGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE3*RMUL, + $ CC, MAXM ) + CALL CGEMM( 'N', TRANB, M, N, N, + $ REAL( ISGN )*RMUL, X, MAXM, B, + $ MAXN, CONE, CC, MAXM ) + RES1 = CLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( ABS( RMUL )*TNRM )*EPS )*XNRM ) +* Verify that TRSYL3 only flushes if TRSYL flushes (but +* there may be cases where TRSYL3 avoid flushing). + IF( SCALE3.EQ.ZERO .AND. SCALE.GT.ZERO .OR. + $ IINFO.NE.INFO ) THEN + NFAIL( 3 ) = NFAIL( 3 ) + 1 + END IF + IF( RES.GT.THRESH .OR. SISNAN( RES ) ) + $ NFAIL( 2 ) = NFAIL( 2 ) + 1 + IF( RES.GT.RMAX( 2 ) ) + $ RMAX( 2 ) = RES + END DO + END DO + END DO + END DO + END DO + END DO +* + RETURN +* +* End of CSYL01 +* + END diff --git a/lapack-netlib/TESTING/EIG/dchkec.f b/lapack-netlib/TESTING/EIG/dchkec.f index 854961884..c4451a627 100644 --- a/lapack-netlib/TESTING/EIG/dchkec.f +++ b/lapack-netlib/TESTING/EIG/dchkec.f @@ -90,21 +90,23 @@ LOGICAL OK CHARACTER*3 PATH INTEGER KLAEXC, KLALN2, KLANV2, KLAQTR, KLASY2, KTREXC, - $ KTRSEN, KTRSNA, KTRSYL, LLAEXC, LLALN2, LLANV2, - $ LLAQTR, LLASY2, LTREXC, LTRSYL, NLANV2, NLAQTR, - $ NLASY2, NTESTS, NTRSYL, KTGEXC, NTGEXC, LTGEXC + $ KTRSEN, KTRSNA, KTRSYL, KTRSYL3, LLAEXC, + $ LLALN2, LLANV2, LLAQTR, LLASY2, LTREXC, LTRSYL, + $ NLANV2, NLAQTR, NLASY2, NTESTS, NTRSYL, KTGEXC, + $ LTGEXC DOUBLE PRECISION EPS, RLAEXC, RLALN2, RLANV2, RLAQTR, RLASY2, - $ RTREXC, RTRSYL, SFMIN, RTGEXC + $ RTREXC, SFMIN, RTGEXC * .. * .. Local Arrays .. - INTEGER LTRSEN( 3 ), LTRSNA( 3 ), NLAEXC( 2 ), - $ NLALN2( 2 ), NTREXC( 3 ), NTRSEN( 3 ), + INTEGER FTRSYL( 3 ), ITRSYL( 2 ), LTRSEN( 3 ), + $ LTRSNA( 3 ), NLAEXC( 2 ), NLALN2( 2 ), + $ NTGEXC( 2 ), NTREXC( 3 ), NTRSEN( 3 ), $ NTRSNA( 3 ) - DOUBLE PRECISION RTRSEN( 3 ), RTRSNA( 3 ) + DOUBLE PRECISION RTRSEN( 3 ), RTRSNA( 3 ), RTRSYL( 2 ) * .. * .. External Subroutines .. EXTERNAL DERREC, DGET31, DGET32, DGET33, DGET34, DGET35, - $ DGET36, DGET37, DGET38, DGET39, DGET40 + $ DGET36, DGET37, DGET38, DGET39, DGET40, DSYL01 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -153,10 +155,24 @@ WRITE( NOUT, FMT = 9996 )RLAEXC, LLAEXC, NLAEXC, KLAEXC END IF * - CALL DGET35( RTRSYL, LTRSYL, NTRSYL, KTRSYL ) - IF( RTRSYL.GT.THRESH ) THEN + CALL DGET35( RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL ) + IF( RTRSYL( 1 ).GT.THRESH ) THEN OK = .FALSE. 
- WRITE( NOUT, FMT = 9995 )RTRSYL, LTRSYL, NTRSYL, KTRSYL + WRITE( NOUT, FMT = 9995 )RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL + END IF +* + CALL DSYL01( THRESH, FTRSYL, RTRSYL, ITRSYL, KTRSYL3 ) + IF( FTRSYL( 1 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9970 )FTRSYL( 1 ), RTRSYL( 1 ), THRESH + END IF + IF( FTRSYL( 2 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9971 )FTRSYL( 2 ), RTRSYL( 2 ), THRESH + END IF + IF( FTRSYL( 3 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9972 )FTRSYL( 3 ) END IF * CALL DGET36( RTREXC, LTREXC, NTREXC, KTREXC, NIN ) @@ -227,7 +243,13 @@ 9987 FORMAT( ' Routines pass computational tests if test ratio is les', $ 's than', F8.2, / / ) 9986 FORMAT( ' Error in DTGEXC: RMAX =', D12.3, / ' LMAX = ', I8, ' N', - $ 'INFO=', I8, ' KNT=', I8 ) + $ 'INFO=', 2I8, ' KNT=', I8 ) + 9972 FORMAT( 'DTRSYL and DTRSYL3 compute an inconsistent result ', + $ 'factor in ', I8, ' tests.') + 9971 FORMAT( 'Error in DTRSYL3: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9970 FORMAT( 'Error in DTRSYL: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) * * End of DCHKEC * diff --git a/lapack-netlib/TESTING/EIG/derrec.f b/lapack-netlib/TESTING/EIG/derrec.f index d5863ad42..f11f48887 100644 --- a/lapack-netlib/TESTING/EIG/derrec.f +++ b/lapack-netlib/TESTING/EIG/derrec.f @@ -23,7 +23,7 @@ *> *> DERREC tests the error exits for the routines for eigen- condition *> estimation for DOUBLE PRECISION matrices: -*> DTRSYL, DTREXC, DTRSNA and DTRSEN. +*> DTRSYL, DTRSYL3, DTREXC, DTRSNA and DTRSEN. *> \endverbatim * * Arguments: @@ -82,7 +82,7 @@ $ WI( NMAX ), WORK( NMAX ), WR( NMAX ) * .. * .. External Subroutines .. - EXTERNAL CHKXER, DTREXC, DTRSEN, DTRSNA, DTRSYL + EXTERNAL CHKXER, DTREXC, DTRSEN, DTRSNA, DTRSYL, DTRSYL3 * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -141,6 +141,43 @@ CALL CHKXER( 'DTRSYL', INFOT, NOUT, LERR, OK ) NT = NT + 8 * +* Test DTRSYL3 +* + SRNAMT = 'DTRSYL3' + INFOT = 1 + CALL DTRSYL3( 'X', 'N', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTRSYL3( 'N', 'X', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DTRSYL3( 'N', 'N', 0, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTRSYL3( 'N', 'N', 1, -1, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRSYL3( 'N', 'N', 1, 0, -1, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DTRSYL3( 'N', 'N', 1, 2, 0, A, 1, B, 1, C, 2, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRSYL3( 'N', 'N', 1, 0, 2, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRSYL3( 'N', 'N', 1, 2, 0, A, 2, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'DTRSYL3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 +* * Test DTREXC * SRNAMT = 'DTREXC' diff --git a/lapack-netlib/TESTING/EIG/dsyl01.f b/lapack-netlib/TESTING/EIG/dsyl01.f new file mode 100644 index 000000000..782d2cd42 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/dsyl01.f @@ -0,0 +1,288 @@ +*> \brief \b DSYL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE DSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) +* +* .. Scalar Arguments .. +* INTEGER KNT +* DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. +* INTEGER NFAIL( 3 ), NINFO( 2 ) +* DOUBLE PRECISION RMAX( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DSYL01 tests DTRSYL and DTRSYL3, routines for solving the Sylvester matrix +*> equation +*> +*> op(A)*X + ISGN*X*op(B) = scale*C, +*> +*> A and B are assumed to be in Schur canonical form, op() represents an +*> optional transpose, and ISGN can be -1 or +1. Scale is an output +*> less than or equal to 1, chosen to avoid overflow in X. +*> +*> The test code verifies that the following residual does not exceed +*> the provided threshold: +*> +*> norm(op(A)*X + ISGN*X*op(B) - scale*C) / +*> (EPS*max(norm(A),norm(B))*norm(X)) +*> +*> This routine complements DGET35 by testing with larger, +*> random matrices, of which some require rescaling of X to avoid overflow. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is DOUBLE PRECISION +*> A test will count as "failed" if the residual, computed as +*> described above, exceeds THRESH. +*> \endverbatim +*> +*> \param[out] NFAIL +*> \verbatim +*> NFAIL is INTEGER array, dimension (3) +*> NFAIL(1) = No. of times residual DTRSYL exceeds threshold THRESH +*> NFAIL(2) = No. of times residual DTRSYL3 exceeds threshold THRESH +*> NFAIL(3) = No. 
of times DTRSYL3 and DTRSYL deviate +*> \endverbatim +*> +*> \param[out] RMAX +*> \verbatim +*> RMAX is DOUBLE PRECISION, dimension (2) +*> RMAX(1) = Value of the largest test ratio of DTRSYL +*> RMAX(2) = Value of the largest test ratio of DTRSYL3 +*> \endverbatim +*> +*> \param[out] NINFO +*> \verbatim +*> NINFO is INTEGER array, dimension (2) +*> NINFO(1) = No. of times DTRSYL returns an expected INFO +*> NINFO(2) = No. of times DTRSYL3 returns an expected INFO +*> \endverbatim +*> +*> \param[out] KNT +*> \verbatim +*> KNT is INTEGER +*> Total number of examples tested. +*> \endverbatim + +* +* -- LAPACK test routine -- + SUBROUTINE DSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER KNT + DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. + INTEGER NFAIL( 3 ), NINFO( 2 ) + DOUBLE PRECISION RMAX( 2 ) +* .. +* +* ===================================================================== +* .. +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + INTEGER MAXM, MAXN, LDSWORK + PARAMETER ( MAXM = 245, MAXN = 192, LDSWORK = 36 ) +* .. +* .. Local Scalars .. + CHARACTER TRANA, TRANB + INTEGER I, INFO, IINFO, ISGN, ITRANA, ITRANB, J, KLA, + $ KUA, KLB, KUB, LIWORK, M, N + DOUBLE PRECISION ANRM, BNRM, BIGNUM, EPS, RES, RES1, RMUL, + $ SCALE, SCALE3, SMLNUM, TNRM, XNRM +* .. +* .. Local Arrays .. + DOUBLE PRECISION A( MAXM, MAXM ), B( MAXN, MAXN ), + $ C( MAXM, MAXN ), CC( MAXM, MAXN ), + $ X( MAXM, MAXN ), + $ DUML( MAXM ), DUMR( MAXN ), + $ D( MAX( MAXM, MAXN ) ), DUM( MAXN ), + $ SWORK( LDSWORK, 126 ), VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ), IDUM( 2 ) +* .. +* .. External Functions .. + LOGICAL DISNAN + DOUBLE PRECISION DLAMCH, DLANGE + EXTERNAL DLAMCH, DLANGE +* .. +* .. External Subroutines .. + EXTERNAL DLATMR, DLACPY, DGEMM, DTRSYL, DTRSYL3 +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, MAX +* .. +* .. Executable Statements .. 
+* +* Get machine parameters +* + EPS = DLAMCH( 'P' ) + SMLNUM = DLAMCH( 'S' ) / EPS + BIGNUM = ONE / SMLNUM +* + VM( 1 ) = ONE + VM( 2 ) = 0.000001D+0 +* +* Begin test loop +* + NINFO( 1 ) = 0 + NINFO( 2 ) = 0 + NFAIL( 1 ) = 0 + NFAIL( 2 ) = 0 + NFAIL( 3 ) = 0 + RMAX( 1 ) = ZERO + RMAX( 2 ) = ZERO + KNT = 0 + DO I = 1, 4 + ISEED( I ) = 1 + END DO + SCALE = ONE + SCALE3 = ONE + LIWORK = MAXM + MAXN + 2 + DO J = 1, 2 + DO ISGN = -1, 1, 2 +* Reset seed (overwritten by LATMR) + DO I = 1, 4 + ISEED( I ) = 1 + END DO + DO M = 32, MAXM, 71 + KLA = 0 + KUA = M - 1 + CALL DLATMR( M, M, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLA, KUA, ZERO, + $ ONE, 'NO', A, MAXM, IWORK, IINFO ) + DO I = 1, M + A( I, I ) = A( I, I ) * VM( J ) + END DO + ANRM = DLANGE( 'M', M, M, A, MAXM, DUM ) + DO N = 51, MAXN, 47 + KLB = 0 + KUB = N - 1 + CALL DLATMR( N, N, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLB, KUB, ZERO, + $ ONE, 'NO', B, MAXN, IWORK, IINFO ) + BNRM = DLANGE( 'M', N, N, B, MAXN, DUM ) + TNRM = MAX( ANRM, BNRM ) + CALL DLATMR( M, N, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, M, N, ZERO, ONE, + $ 'NO', C, MAXM, IWORK, IINFO ) + DO ITRANA = 1, 2 + IF( ITRANA.EQ.1 ) THEN + TRANA = 'N' + END IF + IF( ITRANA.EQ.2 ) THEN + TRANA = 'T' + END IF + DO ITRANB = 1, 2 + IF( ITRANB.EQ.1 ) THEN + TRANB = 'N' + END IF + IF( ITRANB.EQ.2 ) THEN + TRANB = 'T' + END IF + KNT = KNT + 1 +* + CALL DLACPY( 'All', M, N, C, MAXM, X, MAXM) + CALL DLACPY( 'All', M, N, C, MAXM, CC, MAXM) + CALL DTRSYL( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE, IINFO ) + IF( IINFO.NE.0 ) + $ NINFO( 1 ) = NINFO( 1 ) + 1 + XNRM = DLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = ONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = ONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL DGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE*RMUL, + $ CC, MAXM ) + CALL DGEMM( 'N', TRANB, M, N, N, + $ DBLE( ISGN )*RMUL, X, MAXM, B, + $ MAXN, ONE, CC, MAXM ) + RES1 = DLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( RMUL*TNRM )*EPS )*XNRM ) + IF( RES.GT.THRESH ) + $ NFAIL( 1 ) = NFAIL( 1 ) + 1 + IF( RES.GT.RMAX( 1 ) ) + $ RMAX( 1 ) = RES +* + CALL DLACPY( 'All', M, N, C, MAXM, X, MAXM ) + CALL DLACPY( 'All', M, N, C, MAXM, CC, MAXM ) + CALL DTRSYL3( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE3, IWORK, LIWORK, + $ SWORK, LDSWORK, INFO) + IF( INFO.NE.0 ) + $ NINFO( 2 ) = NINFO( 2 ) + 1 + XNRM = DLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = ONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = ONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL DGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE3*RMUL, + $ CC, MAXM ) + CALL DGEMM( 'N', TRANB, M, N, N, + $ DBLE( ISGN )*RMUL, X, MAXM, B, + $ MAXN, ONE, CC, MAXM ) + RES1 = DLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( RMUL*TNRM )*EPS )*XNRM ) +* Verify that TRSYL3 only flushes if TRSYL flushes (but +* there may be cases where TRSYL3 avoid flushing). + IF( SCALE3.EQ.ZERO .AND. SCALE.GT.ZERO .OR. + $ IINFO.NE.INFO ) THEN + NFAIL( 3 ) = NFAIL( 3 ) + 1 + END IF + IF( RES.GT.THRESH .OR. 
DISNAN( RES ) ) + $ NFAIL( 2 ) = NFAIL( 2 ) + 1 + IF( RES.GT.RMAX( 2 ) ) + $ RMAX( 2 ) = RES + END DO + END DO + END DO + END DO + END DO + END DO +* + RETURN +* +* End of DSYL01 +* + END diff --git a/lapack-netlib/TESTING/EIG/schkec.f b/lapack-netlib/TESTING/EIG/schkec.f index e6123e1ad..59abb2466 100644 --- a/lapack-netlib/TESTING/EIG/schkec.f +++ b/lapack-netlib/TESTING/EIG/schkec.f @@ -90,21 +90,23 @@ LOGICAL OK CHARACTER*3 PATH INTEGER KLAEXC, KLALN2, KLANV2, KLAQTR, KLASY2, KTREXC, - $ KTRSEN, KTRSNA, KTRSYL, LLAEXC, LLALN2, LLANV2, - $ LLAQTR, LLASY2, LTREXC, LTRSYL, NLANV2, NLAQTR, - $ NLASY2, NTESTS, NTRSYL, KTGEXC, NTGEXC, LTGEXC + $ KTRSEN, KTRSNA, KTRSYL, KTRSYL3, LLAEXC, + $ LLALN2, LLANV2, LLAQTR, LLASY2, LTREXC, LTRSYL, + $ NLANV2, NLAQTR, NLASY2, NTESTS, NTRSYL, KTGEXC, + $ LTGEXC REAL EPS, RLAEXC, RLALN2, RLANV2, RLAQTR, RLASY2, - $ RTREXC, RTRSYL, SFMIN, RTGEXC + $ RTREXC, SFMIN, RTGEXC * .. * .. Local Arrays .. - INTEGER LTRSEN( 3 ), LTRSNA( 3 ), NLAEXC( 2 ), - $ NLALN2( 2 ), NTREXC( 3 ), NTRSEN( 3 ), + INTEGER FTRSYL( 3 ), ITRSYL( 2 ), LTRSEN( 3 ), + $ LTRSNA( 3 ), NLAEXC( 2 ), NLALN2( 2 ), + $ NTGEXC( 2 ), NTREXC( 3 ), NTRSEN( 3 ), $ NTRSNA( 3 ) - REAL RTRSEN( 3 ), RTRSNA( 3 ) + REAL RTRSEN( 3 ), RTRSNA( 3 ), RTRSYL( 2 ) * .. * .. External Subroutines .. EXTERNAL SERREC, SGET31, SGET32, SGET33, SGET34, SGET35, - $ SGET36, SGET37, SGET38, SGET39, SGET40 + $ SGET36, SGET37, SGET38, SGET39, SGET40, SSYL01 * .. * .. External Functions .. REAL SLAMCH @@ -153,10 +155,24 @@ WRITE( NOUT, FMT = 9996 )RLAEXC, LLAEXC, NLAEXC, KLAEXC END IF * - CALL SGET35( RTRSYL, LTRSYL, NTRSYL, KTRSYL ) - IF( RTRSYL.GT.THRESH ) THEN + CALL SGET35( RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL ) + IF( RTRSYL( 1 ).GT.THRESH ) THEN OK = .FALSE. - WRITE( NOUT, FMT = 9995 )RTRSYL, LTRSYL, NTRSYL, KTRSYL + WRITE( NOUT, FMT = 9995 )RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL + END IF +* + CALL SSYL01( THRESH, FTRSYL, RTRSYL, ITRSYL, KTRSYL3 ) + IF( FTRSYL( 1 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9970 )FTRSYL( 1 ), RTRSYL( 1 ), THRESH + END IF + IF( FTRSYL( 2 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9971 )FTRSYL( 2 ), RTRSYL( 2 ), THRESH + END IF + IF( FTRSYL( 3 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9972 )FTRSYL( 3 ) END IF * CALL SGET36( RTREXC, LTREXC, NTREXC, KTREXC, NIN ) @@ -227,7 +243,13 @@ 9987 FORMAT( ' Routines pass computational tests if test ratio is les', $ 's than', F8.2, / / ) 9986 FORMAT( ' Error in STGEXC: RMAX =', E12.3, / ' LMAX = ', I8, ' N', - $ 'INFO=', I8, ' KNT=', I8 ) + $ 'INFO=', 2I8, ' KNT=', I8 ) + 9972 FORMAT( 'STRSYL and STRSYL3 compute an inconsistent result ', + $ 'factor in ', I8, ' tests.') + 9971 FORMAT( 'Error in STRSYL3: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9970 FORMAT( 'Error in STRSYL: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) * * End of SCHKEC * diff --git a/lapack-netlib/TESTING/EIG/serrec.f b/lapack-netlib/TESTING/EIG/serrec.f index 249f0e642..9a7ceb362 100644 --- a/lapack-netlib/TESTING/EIG/serrec.f +++ b/lapack-netlib/TESTING/EIG/serrec.f @@ -23,7 +23,7 @@ *> *> SERREC tests the error exits for the routines for eigen- condition *> estimation for REAL matrices: -*> STRSYL, STREXC, STRSNA and STRSEN. +*> STRSYL, STRSYL3, STREXC, STRSNA and STRSEN. *> \endverbatim * * Arguments: @@ -82,7 +82,7 @@ $ WI( NMAX ), WORK( NMAX ), WR( NMAX ) * .. * .. External Subroutines .. 
- EXTERNAL CHKXER, STREXC, STRSEN, STRSNA, STRSYL + EXTERNAL CHKXER, STREXC, STRSEN, STRSNA, STRSYL, STRSYL3 * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -141,6 +141,43 @@ CALL CHKXER( 'STRSYL', INFOT, NOUT, LERR, OK ) NT = NT + 8 * +* Test STRSYL3 +* + SRNAMT = 'STRSYL3' + INFOT = 1 + CALL STRSYL3( 'X', 'N', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STRSYL3( 'N', 'X', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL STRSYL3( 'N', 'N', 0, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STRSYL3( 'N', 'N', 1, -1, 0, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRSYL3( 'N', 'N', 1, 0, -1, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL STRSYL3( 'N', 'N', 1, 2, 0, A, 1, B, 1, C, 2, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRSYL3( 'N', 'N', 1, 0, 2, A, 1, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRSYL3( 'N', 'N', 1, 2, 0, A, 2, B, 1, C, 1, SCALE, + $ IWORK, NMAX, WORK, NMAX, INFO ) + CALL CHKXER( 'STRSYL3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 +* * Test STREXC * SRNAMT = 'STREXC' diff --git a/lapack-netlib/TESTING/EIG/ssyl01.f b/lapack-netlib/TESTING/EIG/ssyl01.f new file mode 100644 index 000000000..22d089dc8 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/ssyl01.f @@ -0,0 +1,288 @@ +*> \brief \b SSYL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE SSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) +* +* .. Scalar Arguments .. +* INTEGER KNT +* REAL THRESH +* .. +* .. Array Arguments .. +* INTEGER NFAIL( 3 ), NINFO( 2 ) +* REAL RMAX( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SSYL01 tests STRSYL and STRSYL3, routines for solving the Sylvester matrix +*> equation +*> +*> op(A)*X + ISGN*X*op(B) = scale*C, +*> +*> A and B are assumed to be in Schur canonical form, op() represents an +*> optional transpose, and ISGN can be -1 or +1. Scale is an output +*> less than or equal to 1, chosen to avoid overflow in X. +*> +*> The test code verifies that the following residual does not exceed +*> the provided threshold: +*> +*> norm(op(A)*X + ISGN*X*op(B) - scale*C) / +*> (EPS*max(norm(A),norm(B))*norm(X)) +*> +*> This routine complements SGET35 by testing with larger, +*> random matrices, of which some require rescaling of X to avoid overflow. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is REAL +*> A test will count as "failed" if the residual, computed as +*> described above, exceeds THRESH. +*> \endverbatim +*> +*> \param[out] NFAIL +*> \verbatim +*> NFAIL is INTEGER array, dimension (3) +*> NFAIL(1) = No. of times residual STRSYL exceeds threshold THRESH +*> NFAIL(2) = No. of times residual STRSYL3 exceeds threshold THRESH +*> NFAIL(3) = No. 
of times STRSYL3 and STRSYL deviate +*> \endverbatim +*> +*> \param[out] RMAX +*> \verbatim +*> RMAX is REAL, dimension (2) +*> RMAX(1) = Value of the largest test ratio of STRSYL +*> RMAX(2) = Value of the largest test ratio of STRSYL3 +*> \endverbatim +*> +*> \param[out] NINFO +*> \verbatim +*> NINFO is INTEGER array, dimension (2) +*> NINFO(1) = No. of times STRSYL returns an expected INFO +*> NINFO(2) = No. of times STRSYL3 returns an expected INFO +*> \endverbatim +*> +*> \param[out] KNT +*> \verbatim +*> KNT is INTEGER +*> Total number of examples tested. +*> \endverbatim + +* +* -- LAPACK test routine -- + SUBROUTINE SSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER KNT + REAL THRESH +* .. +* .. Array Arguments .. + INTEGER NFAIL( 3 ), NINFO( 2 ) + REAL RMAX( 2 ) +* .. +* +* ===================================================================== +* .. +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + INTEGER MAXM, MAXN, LDSWORK + PARAMETER ( MAXM = 101, MAXN = 138, LDSWORK = 18 ) +* .. +* .. Local Scalars .. + CHARACTER TRANA, TRANB + INTEGER I, INFO, IINFO, ISGN, ITRANA, ITRANB, J, KLA, + $ KUA, KLB, KUB, LIWORK, M, N + REAL ANRM, BNRM, BIGNUM, EPS, RES, RES1, RMUL, + $ SCALE, SCALE3, SMLNUM, TNRM, XNRM +* .. +* .. Local Arrays .. + REAL A( MAXM, MAXM ), B( MAXN, MAXN ), + $ C( MAXM, MAXN ), CC( MAXM, MAXN ), + $ X( MAXM, MAXN ), + $ DUML( MAXM ), DUMR( MAXN ), + $ D( MAX( MAXM, MAXN ) ), DUM( MAXN ), + $ SWORK( LDSWORK, 54 ), VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ), IDUM( 2 ) +* .. +* .. External Functions .. + LOGICAL SISNAN + REAL SLAMCH, SLANGE + EXTERNAL SISNAN, SLAMCH, SLANGE +* .. +* .. External Subroutines .. + EXTERNAL SLATMR, SLACPY, SGEMM, STRSYL, STRSYL3 +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, REAL, MAX +* .. +* .. Executable Statements .. 
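+*
+*     Overview of the sweep below: for each diagonal scaling VM( J ),
+*     sign ISGN and matrix sizes M and N, upper triangular matrices
+*     A (M-by-M) and B (N-by-N) and a right-hand side C (M-by-N) are
+*     generated with SLATMR.  The diagonal of A is multiplied by
+*     VM( J ), where the smaller value VM( 2 ) is intended to exercise
+*     the rescaling path (SCALE.LT.ONE).  The equation
+*
+*        op(A)*X + ISGN*X*op(B) = SCALE*C
+*
+*     is solved with STRSYL and with STRSYL3, the residual described
+*     in the header is formed with two calls to SGEMM, and failures,
+*     largest test ratios and nonzero INFO values are recorded in
+*     NFAIL, RMAX and NINFO.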
+* +* Get machine parameters +* + EPS = SLAMCH( 'P' ) + SMLNUM = SLAMCH( 'S' ) / EPS + BIGNUM = ONE / SMLNUM +* + VM( 1 ) = ONE + VM( 2 ) = 0.05E+0 +* +* Begin test loop +* + NINFO( 1 ) = 0 + NINFO( 2 ) = 0 + NFAIL( 1 ) = 0 + NFAIL( 2 ) = 0 + NFAIL( 3 ) = 0 + RMAX( 1 ) = ZERO + RMAX( 2 ) = ZERO + KNT = 0 + DO I = 1, 4 + ISEED( I ) = 1 + END DO + SCALE = ONE + SCALE3 = ONE + LIWORK = MAXM + MAXN + 2 + DO J = 1, 2 + DO ISGN = -1, 1, 2 +* Reset seed (overwritten by LATMR) + DO I = 1, 4 + ISEED( I ) = 1 + END DO + DO M = 32, MAXM, 71 + KLA = 0 + KUA = M - 1 + CALL SLATMR( M, M, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLA, KUA, ZERO, + $ ONE, 'NO', A, MAXM, IWORK, IINFO ) + DO I = 1, M + A( I, I ) = A( I, I ) * VM( J ) + END DO + ANRM = SLANGE( 'M', M, M, A, MAXM, DUM ) + DO N = 51, MAXN, 47 + KLB = 0 + KUB = N - 1 + CALL SLATMR( N, N, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLB, KUB, ZERO, + $ ONE, 'NO', B, MAXN, IWORK, IINFO ) + BNRM = SLANGE( 'M', N, N, B, MAXN, DUM ) + TNRM = MAX( ANRM, BNRM ) + CALL SLATMR( M, N, 'S', ISEED, 'N', D, + $ 6, ONE, ONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, M, N, ZERO, ONE, + $ 'NO', C, MAXM, IWORK, IINFO ) + DO ITRANA = 1, 2 + IF( ITRANA.EQ.1 ) THEN + TRANA = 'N' + END IF + IF( ITRANA.EQ.2 ) THEN + TRANA = 'T' + END IF + DO ITRANB = 1, 2 + IF( ITRANB.EQ.1 ) THEN + TRANB = 'N' + END IF + IF( ITRANB.EQ.2 ) THEN + TRANB = 'T' + END IF + KNT = KNT + 1 +* + CALL SLACPY( 'All', M, N, C, MAXM, X, MAXM) + CALL SLACPY( 'All', M, N, C, MAXM, CC, MAXM) + CALL STRSYL( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE, IINFO ) + IF( IINFO.NE.0 ) + $ NINFO( 1 ) = NINFO( 1 ) + 1 + XNRM = SLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = ONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = ONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL SGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE*RMUL, + $ C, MAXM ) + CALL SGEMM( 'N', TRANB, M, N, N, + $ REAL( ISGN )*RMUL, X, MAXM, B, + $ MAXN, ONE, C, MAXM ) + RES1 = SLANGE( 'M', M, N, C, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( RMUL*TNRM )*EPS )*XNRM ) + IF( RES.GT.THRESH ) + $ NFAIL( 1 ) = NFAIL( 1 ) + 1 + IF( RES.GT.RMAX( 1 ) ) + $ RMAX( 1 ) = RES +* + CALL SLACPY( 'All', M, N, C, MAXM, X, MAXM ) + CALL SLACPY( 'All', M, N, C, MAXM, CC, MAXM ) + CALL STRSYL3( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE3, IWORK, LIWORK, + $ SWORK, LDSWORK, INFO) + IF( INFO.NE.0 ) + $ NINFO( 2 ) = NINFO( 2 ) + 1 + XNRM = SLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = ONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = ONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL SGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE3*RMUL, + $ CC, MAXM ) + CALL SGEMM( 'N', TRANB, M, N, N, + $ REAL( ISGN )*RMUL, X, MAXM, B, + $ MAXN, ONE, CC, MAXM ) + RES1 = SLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( RMUL*TNRM )*EPS )*XNRM ) +* Verify that TRSYL3 only flushes if TRSYL flushes (but +* there may be cases where TRSYL3 avoid flushing). + IF( SCALE3.EQ.ZERO .AND. SCALE.GT.ZERO .OR. + $ IINFO.NE.INFO ) THEN + NFAIL( 3 ) = NFAIL( 3 ) + 1 + END IF + IF( RES.GT.THRESH .OR. 
SISNAN( RES ) ) + $ NFAIL( 2 ) = NFAIL( 2 ) + 1 + IF( RES.GT.RMAX( 2 ) ) + $ RMAX( 2 ) = RES + END DO + END DO + END DO + END DO + END DO + END DO +* + RETURN +* +* End of SSYL01 +* + END diff --git a/lapack-netlib/TESTING/EIG/zchkec.f b/lapack-netlib/TESTING/EIG/zchkec.f index 1e1c29e0d..62a76d357 100644 --- a/lapack-netlib/TESTING/EIG/zchkec.f +++ b/lapack-netlib/TESTING/EIG/zchkec.f @@ -88,17 +88,17 @@ * .. Local Scalars .. LOGICAL OK CHARACTER*3 PATH - INTEGER KTREXC, KTRSEN, KTRSNA, KTRSYL, LTREXC, LTRSYL, - $ NTESTS, NTREXC, NTRSYL - DOUBLE PRECISION EPS, RTREXC, RTRSYL, SFMIN + INTEGER KTREXC, KTRSEN, KTRSNA, KTRSYL, KTRSYL3, + $ LTREXC, LTRSYL, NTESTS, NTREXC, NTRSYL + DOUBLE PRECISION EPS, RTREXC, SFMIN * .. * .. Local Arrays .. - INTEGER LTRSEN( 3 ), LTRSNA( 3 ), NTRSEN( 3 ), - $ NTRSNA( 3 ) - DOUBLE PRECISION RTRSEN( 3 ), RTRSNA( 3 ) + INTEGER FTRSYL( 3 ), ITRSYL( 2 ), LTRSEN( 3 ), + $ LTRSNA( 3 ), NTRSEN( 3 ), NTRSNA( 3 ) + DOUBLE PRECISION RTRSEN( 3 ), RTRSNA( 3 ), RTRSYL( 2 ) * .. * .. External Subroutines .. - EXTERNAL ZERREC, ZGET35, ZGET36, ZGET37, ZGET38 + EXTERNAL ZERREC, ZGET35, ZGET36, ZGET37, ZGET38, ZSYL01 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -120,10 +120,24 @@ $ CALL ZERREC( PATH, NOUT ) * OK = .TRUE. - CALL ZGET35( RTRSYL, LTRSYL, NTRSYL, KTRSYL, NIN ) - IF( RTRSYL.GT.THRESH ) THEN + CALL ZGET35( RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL, NIN ) + IF( RTRSYL( 1 ).GT.THRESH ) THEN OK = .FALSE. - WRITE( NOUT, FMT = 9999 )RTRSYL, LTRSYL, NTRSYL, KTRSYL + WRITE( NOUT, FMT = 9999 )RTRSYL( 1 ), LTRSYL, NTRSYL, KTRSYL + END IF +* + CALL ZSYL01( THRESH, FTRSYL, RTRSYL, ITRSYL, KTRSYL3 ) + IF( FTRSYL( 1 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9970 )FTRSYL( 1 ), RTRSYL( 1 ), THRESH + END IF + IF( FTRSYL( 2 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9971 )FTRSYL( 2 ), RTRSYL( 2 ), THRESH + END IF + IF( FTRSYL( 3 ).GT.0 ) THEN + OK = .FALSE. + WRITE( NOUT, FMT = 9972 )FTRSYL( 3 ) END IF * CALL ZGET36( RTREXC, LTREXC, NTREXC, KTREXC, NIN ) @@ -148,7 +162,7 @@ WRITE( NOUT, FMT = 9996 )RTRSEN, LTRSEN, NTRSEN, KTRSEN END IF * - NTESTS = KTRSYL + KTREXC + KTRSNA + KTRSEN + NTESTS = KTRSYL + KTRSYL3 + KTREXC + KTRSNA + KTRSEN IF( OK ) $ WRITE( NOUT, FMT = 9995 )PATH, NTESTS * @@ -169,6 +183,12 @@ $ / ' Safe minimum (SFMIN) = ', D16.6, / ) 9992 FORMAT( ' Routines pass computational tests if test ratio is ', $ 'less than', F8.2, / / ) + 9970 FORMAT( 'Error in ZTRSYL: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9971 FORMAT( 'Error in ZTRSYL3: ', I8, ' tests fail the threshold.', / + $ 'Maximum test ratio =', D12.3, ' threshold =', D12.3 ) + 9972 FORMAT( 'ZTRSYL and ZTRSYL3 compute an inconsistent scale ', + $ 'factor in ', I8, ' tests.') RETURN * * End of ZCHKEC diff --git a/lapack-netlib/TESTING/EIG/zerrec.f b/lapack-netlib/TESTING/EIG/zerrec.f index dc6129da9..e1938f57d 100644 --- a/lapack-netlib/TESTING/EIG/zerrec.f +++ b/lapack-netlib/TESTING/EIG/zerrec.f @@ -23,7 +23,7 @@ *> *> ZERREC tests the error exits for the routines for eigen- condition *> estimation for DOUBLE PRECISION matrices: -*> ZTRSYL, ZTREXC, ZTRSNA and ZTRSEN. +*> ZTRSYL, ZTRSYL3, ZTREXC, ZTRSNA and ZTRSEN. *> \endverbatim * * Arguments: @@ -77,7 +77,7 @@ * .. * .. Local Arrays .. LOGICAL SEL( NMAX ) - DOUBLE PRECISION RW( LW ), S( NMAX ), SEP( NMAX ) + DOUBLE PRECISION RW( LW ), S( NMAX ), SEP( NMAX ), SWORK( NMAX ) COMPLEX*16 A( NMAX, NMAX ), B( NMAX, NMAX ), $ C( NMAX, NMAX ), WORK( LW ), X( NMAX ) * .. 
@@ -141,6 +141,43 @@ CALL CHKXER( 'ZTRSYL', INFOT, NOUT, LERR, OK ) NT = NT + 8 * +* Test ZTRSYL3 +* + SRNAMT = 'ZTRSYL3' + INFOT = 1 + CALL ZTRSYL3( 'X', 'N', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTRSYL3( 'N', 'X', 1, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZTRSYL3( 'N', 'N', 0, 0, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTRSYL3( 'N', 'N', 1, -1, 0, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSYL3( 'N', 'N', 1, 0, -1, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZTRSYL3( 'N', 'N', 1, 2, 0, A, 1, B, 1, C, 2, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSYL3( 'N', 'N', 1, 0, 2, A, 1, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSYL3( 'N', 'N', 1, 2, 0, A, 2, B, 1, C, 1, SCALE, + $ SWORK, NMAX, INFO ) + CALL CHKXER( 'ZTRSYL3', INFOT, NOUT, LERR, OK ) + NT = NT + 8 +* * Test ZTREXC * SRNAMT = 'ZTREXC' diff --git a/lapack-netlib/TESTING/EIG/zsyl01.f b/lapack-netlib/TESTING/EIG/zsyl01.f new file mode 100644 index 000000000..1e8619a34 --- /dev/null +++ b/lapack-netlib/TESTING/EIG/zsyl01.f @@ -0,0 +1,294 @@ +*> \brief \b ZSYL01 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE ZSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) +* +* .. Scalar Arguments .. +* INTEGER KNT +* DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. +* INTEGER NFAIL( 3 ), NINFO( 2 ) +* DOUBLE PRECISION RMAX( 2 ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZSYL01 tests ZTRSYL and ZTRSYL3, routines for solving the Sylvester matrix +*> equation +*> +*> op(A)*X + ISGN*X*op(B) = scale*C, +*> +*> where op(A) and op(B) are both upper triangular form, op() represents an +*> optional conjugate transpose, and ISGN can be -1 or +1. Scale is an output +*> less than or equal to 1, chosen to avoid overflow in X. +*> +*> The test code verifies that the following residual does not exceed +*> the provided threshold: +*> +*> norm(op(A)*X + ISGN*X*op(B) - scale*C) / +*> (EPS*max(norm(A),norm(B))*norm(X)) +*> +*> This routine complements ZGET35 by testing with larger, +*> random matrices, of which some require rescaling of X to avoid overflow. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] THRESH +*> \verbatim +*> THRESH is DOUBLE PRECISION +*> A test will count as "failed" if the residual, computed as +*> described above, exceeds THRESH. +*> \endverbatim +*> +*> \param[out] NFAIL +*> \verbatim +*> NFAIL is INTEGER array, dimension (3) +*> NFAIL(1) = No. of times residual ZTRSYL exceeds threshold THRESH +*> NFAIL(2) = No. of times residual ZTRSYL3 exceeds threshold THRESH +*> NFAIL(3) = No. 
of times ZTRSYL3 and ZTRSYL deviate +*> \endverbatim +*> +*> \param[out] RMAX +*> \verbatim +*> RMAX is DOUBLE PRECISION array, dimension (2) +*> RMAX(1) = Value of the largest test ratio of ZTRSYL +*> RMAX(2) = Value of the largest test ratio of ZTRSYL3 +*> \endverbatim +*> +*> \param[out] NINFO +*> \verbatim +*> NINFO is INTEGER array, dimension (2) +*> NINFO(1) = No. of times ZTRSYL returns an expected INFO +*> NINFO(2) = No. of times ZTRSYL3 returns an expected INFO +*> \endverbatim +*> +*> \param[out] KNT +*> \verbatim +*> KNT is INTEGER +*> Total number of examples tested. +*> \endverbatim + +* +* -- LAPACK test routine -- + SUBROUTINE ZSYL01( THRESH, NFAIL, RMAX, NINFO, KNT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER KNT + DOUBLE PRECISION THRESH +* .. +* .. Array Arguments .. + INTEGER NFAIL( 3 ), NINFO( 2 ) + DOUBLE PRECISION RMAX( 2 ) +* .. +* +* ===================================================================== +* .. +* .. Parameters .. + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D0, 0.0D+0 ) ) + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + INTEGER MAXM, MAXN, LDSWORK + PARAMETER ( MAXM = 185, MAXN = 192, LDSWORK = 36 ) +* .. +* .. Local Scalars .. + CHARACTER TRANA, TRANB + INTEGER I, INFO, IINFO, ISGN, ITRANA, ITRANB, J, KLA, + $ KUA, KLB, KUB, M, N + DOUBLE PRECISION ANRM, BNRM, BIGNUM, EPS, RES, RES1, + $ SCALE, SCALE3, SMLNUM, TNRM, XNRM + COMPLEX*16 RMUL +* .. +* .. Local Arrays .. + COMPLEX*16 A( MAXM, MAXM ), B( MAXN, MAXN ), + $ C( MAXM, MAXN ), CC( MAXM, MAXN ), + $ X( MAXM, MAXN ), + $ DUML( MAXM ), DUMR( MAXN ), + $ D( MIN( MAXM, MAXN ) ) + DOUBLE PRECISION SWORK( LDSWORK, 103 ), DUM( MAXN ), VM( 2 ) + INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) +* .. +* .. External Functions .. + LOGICAL DISNAN + DOUBLE PRECISION DLAMCH, ZLANGE + EXTERNAL DISNAN, DLAMCH, ZLANGE +* .. +* .. External Subroutines .. + EXTERNAL ZLATMR, ZLACPY, ZGEMM, ZTRSYL, ZTRSYL3 +* .. +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, MAX, SQRT +* .. +* .. Executable Statements .. 
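+*
+*     Overview of the sweep below: for each diagonal scaling VM( J ),
+*     sign ISGN and matrix sizes M and N, upper triangular matrices
+*     A (M-by-M) and B (N-by-N) and a right-hand side C (M-by-N) are
+*     generated with ZLATMR.  The diagonals of A and B are multiplied
+*     by VM( J ), where the smaller value VM( 2 ) is intended to
+*     trigger rescaling (INFO = 1).  The equation
+*
+*        op(A)*X + ISGN*X*op(B) = SCALE*C
+*
+*     is solved with ZTRSYL and with ZTRSYL3, the residual described
+*     in the header is formed with two calls to ZGEMM, and failures,
+*     largest test ratios and nonzero INFO values are recorded in
+*     NFAIL, RMAX and NINFO.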
+* +* Get machine parameters +* + EPS = DLAMCH( 'P' ) + SMLNUM = DLAMCH( 'S' ) / EPS + BIGNUM = ONE / SMLNUM +* +* Expect INFO = 0 + VM( 1 ) = ONE +* Expect INFO = 1 + VM( 2 ) = 0.05D+0 +* +* Begin test loop +* + NINFO( 1 ) = 0 + NINFO( 2 ) = 0 + NFAIL( 1 ) = 0 + NFAIL( 2 ) = 0 + NFAIL( 3 ) = 0 + RMAX( 1 ) = ZERO + RMAX( 2 ) = ZERO + KNT = 0 + ISEED( 1 ) = 1 + ISEED( 2 ) = 1 + ISEED( 3 ) = 1 + ISEED( 4 ) = 1 + SCALE = ONE + SCALE3 = ONE + DO J = 1, 2 + DO ISGN = -1, 1, 2 +* Reset seed (overwritten by LATMR) + ISEED( 1 ) = 1 + ISEED( 2 ) = 1 + ISEED( 3 ) = 1 + ISEED( 4 ) = 1 + DO M = 32, MAXM, 51 + KLA = 0 + KUA = M - 1 + CALL ZLATMR( M, M, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLA, KUA, ZERO, + $ ONE, 'NO', A, MAXM, IWORK, + $ IINFO ) + DO I = 1, M + A( I, I ) = A( I, I ) * VM( J ) + END DO + ANRM = ZLANGE( 'M', M, M, A, MAXM, DUM ) + DO N = 51, MAXN, 47 + KLB = 0 + KUB = N - 1 + CALL ZLATMR( N, N, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, KLB, KUB, ZERO, + $ ONE, 'NO', B, MAXN, IWORK, + $ IINFO ) + DO I = 1, N + B( I, I ) = B( I, I ) * VM ( J ) + END DO + BNRM = ZLANGE( 'M', N, N, B, MAXN, DUM ) + TNRM = MAX( ANRM, BNRM ) + CALL ZLATMR( M, N, 'S', ISEED, 'N', D, + $ 6, ONE, CONE, 'T', 'N', + $ DUML, 1, ONE, DUMR, 1, ONE, + $ 'N', IWORK, M, N, ZERO, ONE, + $ 'NO', C, MAXM, IWORK, IINFO ) + DO ITRANA = 1, 2 + IF( ITRANA.EQ.1 ) + $ TRANA = 'N' + IF( ITRANA.EQ.2 ) + $ TRANA = 'C' + DO ITRANB = 1, 2 + IF( ITRANB.EQ.1 ) + $ TRANB = 'N' + IF( ITRANB.EQ.2 ) + $ TRANB = 'C' + KNT = KNT + 1 +* + CALL ZLACPY( 'All', M, N, C, MAXM, X, MAXM) + CALL ZLACPY( 'All', M, N, C, MAXM, CC, MAXM) + CALL ZTRSYL( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE, IINFO ) + IF( IINFO.NE.0 ) + $ NINFO( 1 ) = NINFO( 1 ) + 1 + XNRM = ZLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = CONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = CONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL ZGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE*RMUL, + $ CC, MAXM ) + CALL ZGEMM( 'N', TRANB, M, N, N, + $ DBLE( ISGN )*RMUL, X, MAXM, B, + $ MAXN, CONE, CC, MAXM ) + RES1 = ZLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( ABS( RMUL )*TNRM )*EPS )*XNRM ) + IF( RES.GT.THRESH ) + $ NFAIL( 1 ) = NFAIL( 1 ) + 1 + IF( RES.GT.RMAX( 1 ) ) + $ RMAX( 1 ) = RES +* + CALL ZLACPY( 'All', M, N, C, MAXM, X, MAXM ) + CALL ZLACPY( 'All', M, N, C, MAXM, CC, MAXM ) + CALL ZTRSYL3( TRANA, TRANB, ISGN, M, N, + $ A, MAXM, B, MAXN, X, MAXM, + $ SCALE3, SWORK, LDSWORK, INFO) + IF( INFO.NE.0 ) + $ NINFO( 2 ) = NINFO( 2 ) + 1 + XNRM = ZLANGE( 'M', M, N, X, MAXM, DUM ) + RMUL = CONE + IF( XNRM.GT.ONE .AND. TNRM.GT.ONE ) THEN + IF( XNRM.GT.BIGNUM / TNRM ) THEN + RMUL = CONE / MAX( XNRM, TNRM ) + END IF + END IF + CALL ZGEMM( TRANA, 'N', M, N, M, RMUL, + $ A, MAXM, X, MAXM, -SCALE3*RMUL, + $ CC, MAXM ) + CALL ZGEMM( 'N', TRANB, M, N, N, + $ DBLE( ISGN )*RMUL, X, MAXM, B, + $ MAXN, CONE, CC, MAXM ) + RES1 = ZLANGE( 'M', M, N, CC, MAXM, DUM ) + RES = RES1 / MAX( SMLNUM, SMLNUM*XNRM, + $ ( ( ABS( RMUL )*TNRM )*EPS )*XNRM ) +* Verify that TRSYL3 only flushes if TRSYL flushes (but +* there may be cases where TRSYL3 avoid flushing). + IF( SCALE3.EQ.ZERO .AND. SCALE.GT.ZERO .OR. + $ IINFO.NE.INFO ) THEN + NFAIL( 3 ) = NFAIL( 3 ) + 1 + END IF + IF( RES.GT.THRESH .OR. 
DISNAN( RES ) ) + $ NFAIL( 2 ) = NFAIL( 2 ) + 1 + IF( RES.GT.RMAX( 2 ) ) + $ RMAX( 2 ) = RES + END DO + END DO + END DO + END DO + END DO + END DO +* + RETURN +* +* End of ZSYL01 +* + END From 13f3bbece1786da4236e128c29bfeeedfed20869 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Nov 2022 23:18:09 +0100 Subject: [PATCH 088/154] Add a BLAS3-based triangular Sylvester equation solver (Reference-LAPACK PR 651) --- lapack-netlib/TESTING/LIN/cchktr.f | 54 ++++++++++++++++++++++----- lapack-netlib/TESTING/LIN/cerrtr.f | 47 +++++++++++++++++++++-- lapack-netlib/TESTING/LIN/dchktr.f | 56 ++++++++++++++++++++++------ lapack-netlib/TESTING/LIN/derrtr.f | 47 +++++++++++++++++++++-- lapack-netlib/TESTING/LIN/schktr.f | 57 ++++++++++++++++++++++------ lapack-netlib/TESTING/LIN/serrtr.f | 47 +++++++++++++++++++++-- lapack-netlib/TESTING/LIN/zchktr.f | 60 +++++++++++++++++++++++------- lapack-netlib/TESTING/LIN/zerrtr.f | 47 +++++++++++++++++++++-- 8 files changed, 358 insertions(+), 57 deletions(-) diff --git a/lapack-netlib/TESTING/LIN/cchktr.f b/lapack-netlib/TESTING/LIN/cchktr.f index ce1ecf761..c55b07643 100644 --- a/lapack-netlib/TESTING/LIN/cchktr.f +++ b/lapack-netlib/TESTING/LIN/cchktr.f @@ -31,7 +31,7 @@ *> *> \verbatim *> -*> CCHKTR tests CTRTRI, -TRS, -RFS, and -CON, and CLATRS +*> CCHKTR tests CTRTRI, -TRS, -RFS, and -CON, and CLATRS(3) *> \endverbatim * * Arguments: @@ -184,7 +184,7 @@ INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) REAL ONE, ZERO @@ -195,13 +195,13 @@ CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - REAL AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + REAL AINVNM, ANORM, BIGNUM, DUMMY, RCOND, RCONDC, + $ RCONDI, RCONDO, RES, SCALE, SLAMCH * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - REAL RESULT( NTESTS ) + REAL RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -210,9 +210,9 @@ * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASUM, CCOPY, CERRTR, CGET04, - $ CLACPY, CLARHS, CLATRS, CLATTR, CTRCON, CTRRFS, - $ CTRT01, CTRT02, CTRT03, CTRT05, CTRT06, CTRTRI, - $ CTRTRS, XLAENV + $ CLACPY, CLARHS, CLATRS, CLATRS3, CLATTR, + $ CSSCAL, CTRCON, CTRRFS, CTRT01, CTRT02, CTRT03, + $ CTRT05, CTRT06, CTRTRI, CTRTRS, XLAENV, SLAMCH * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -236,6 +236,7 @@ * PATH( 1: 1 ) = 'Complex precision' PATH( 2: 3 ) = 'TR' + BIGNUM = SLAMCH('Overflow') / SLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -380,7 +381,7 @@ * This line is needed on a Sun SPARCstation. * IF( N.GT.0 ) - $ DUMMY = A( 1 ) + $ DUMMY = REAL( A( 1 ) ) * CALL CTRT02( UPLO, TRANS, DIAG, N, NRHS, A, LDA, $ X, LDA, B, LDA, WORK, RWORK, @@ -535,6 +536,32 @@ $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B. +* + SRNAMT = 'CLATRS3' + CALL CCOPY( N, X, 1, B, 1 ) + CALL CCOPY( N, X, 1, B, 1 ) + CALL CSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL CLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from CLATRS3. 
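+*                 (CLATRS3 solves both right-hand sides at once and
+*                 returns one scale factor per column in SCALE3; the
+*                 second column of B was scaled by BIGNUM above so
+*                 that rescaling is typically required for it.)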
+* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'CLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'Y', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) + CALL CTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL CSSCAL( N, BIGNUM, X, 1 ) + CALL CTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -552,7 +579,14 @@ $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'CLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE diff --git a/lapack-netlib/TESTING/LIN/cerrtr.f b/lapack-netlib/TESTING/LIN/cerrtr.f index db65edd88..9ba784f62 100644 --- a/lapack-netlib/TESTING/LIN/cerrtr.f +++ b/lapack-netlib/TESTING/LIN/cerrtr.f @@ -82,9 +82,10 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, CLATBS, CLATPS, CLATRS, CTBCON, - $ CTBRFS, CTBTRS, CTPCON, CTPRFS, CTPTRI, CTPTRS, - $ CTRCON, CTRRFS, CTRTI2, CTRTRI, CTRTRS + EXTERNAL ALAESM, CHKXER, CLATBS, CLATPS, CLATRS, + $ CLATRS3, CTBCON, CTBRFS, CTBTRS, CTPCON, + $ CTPRFS, CTPTRI, CTPTRS, CTRCON, CTRRFS, CTRTI2, + $ CTRTRI, CTRTRS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -240,6 +241,46 @@ CALL CLATRS( 'U', 'N', 'N', 'N', 2, A, 1, X, SCALE, RW, INFO ) CALL CHKXER( 'CLATRS', INFOT, NOUT, LERR, OK ) * +* CLATRS3 +* + SRNAMT = 'CLATRS3' + INFOT = 1 + CALL CLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL CLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 0, INFO ) + CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) +* * Test error exits for the packed triangular routines. 
* ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN diff --git a/lapack-netlib/TESTING/LIN/dchktr.f b/lapack-netlib/TESTING/LIN/dchktr.f index a4a1150c0..57e87326b 100644 --- a/lapack-netlib/TESTING/LIN/dchktr.f +++ b/lapack-netlib/TESTING/LIN/dchktr.f @@ -30,7 +30,7 @@ *> *> \verbatim *> -*> DCHKTR tests DTRTRI, -TRS, -RFS, and -CON, and DLATRS +*> DCHKTR tests DTRTRI, -TRS, -RFS, and -CON, and DLATRS(3) *> \endverbatim * * Arguments: @@ -187,7 +187,7 @@ INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) DOUBLE PRECISION ONE, ZERO @@ -198,13 +198,13 @@ CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - DOUBLE PRECISION AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + DOUBLE PRECISION AINVNM, ANORM, BIGNUM, DLAMCH, DUMMY, RCOND, + $ RCONDC, RCONDI, RCONDO, RES, SCALE * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - DOUBLE PRECISION RESULT( NTESTS ) + DOUBLE PRECISION RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -213,9 +213,9 @@ * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASUM, DCOPY, DERRTR, DGET04, - $ DLACPY, DLARHS, DLATRS, DLATTR, DTRCON, DTRRFS, - $ DTRT01, DTRT02, DTRT03, DTRT05, DTRT06, DTRTRI, - $ DTRTRS, XLAENV + $ DLACPY, DLAMCH, DSCAL, DLARHS, DLATRS, DLATRS3, + $ DLATTR, DTRCON, DTRRFS, DTRT01, DTRT02, DTRT03, + $ DTRT05, DTRT06, DTRTRI, DTRTRS, XLAENV * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -239,6 +239,7 @@ * PATH( 1: 1 ) = 'Double precision' PATH( 2: 3 ) = 'TR' + BIGNUM = DLAMCH('Overflow') / DLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -539,6 +540,32 @@ $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B +* + SRNAMT = 'DLATRS3' + CALL DCOPY( N, X, 1, B, 1 ) + CALL DCOPY( N, X, 1, B( N+1 ), 1 ) + CALL DSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL DLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from DLATRS3. +* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'DLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'N', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) + CALL DTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL DSCAL( N, BIGNUM, X, 1 ) + CALL DTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RES ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -556,7 +583,14 @@ $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'DLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE @@ -569,8 +603,8 @@ 9999 FORMAT( ' UPLO=''', A1, ''', DIAG=''', A1, ''', N=', I5, ', NB=', $ I4, ', type ', I2, ', test(', I2, ')= ', G12.5 ) 9998 FORMAT( ' UPLO=''', A1, ''', TRANS=''', A1, ''', DIAG=''', A1, - $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', - $ test(', I2, ')= ', G12.5 ) + $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', test(', + $ I2, ')= ', G12.5 ) 9997 FORMAT( ' NORM=''', A1, ''', UPLO =''', A1, ''', N=', I5, ',', $ 11X, ' type ', I2, ', test(', I2, ')=', G12.5 ) 9996 FORMAT( 1X, A, '( ''', A1, ''', ''', A1, ''', ''', A1, ''', ''', diff --git a/lapack-netlib/TESTING/LIN/derrtr.f b/lapack-netlib/TESTING/LIN/derrtr.f index a667f0d2b..d0580497d 100644 --- a/lapack-netlib/TESTING/LIN/derrtr.f +++ b/lapack-netlib/TESTING/LIN/derrtr.f @@ -83,9 +83,10 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, DLATBS, DLATPS, DLATRS, DTBCON, - $ DTBRFS, DTBTRS, DTPCON, DTPRFS, DTPTRI, DTPTRS, - $ DTRCON, DTRRFS, DTRTI2, DTRTRI, DTRTRS + EXTERNAL ALAESM, CHKXER, DLATBS, DLATPS, DLATRS, + $ DLATRS3, DTBCON, DTBRFS, DTBTRS, DTPCON, + $ DTPRFS, DTPTRI, DTPTRS, DTRCON, DTRRFS, + $ DTRTI2, DTRTRI, DTRTRS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -244,6 +245,46 @@ INFOT = 7 CALL DLATRS( 'U', 'N', 'N', 'N', 2, A, 1, X, SCALE, W, INFO ) CALL CHKXER( 'DLATRS', INFOT, NOUT, LERR, OK ) +* +* DLATRS3 +* + SRNAMT = 'DLATRS3' + INFOT = 1 + CALL DLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL DLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 0, INFO ) + CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN * diff --git a/lapack-netlib/TESTING/LIN/schktr.f b/lapack-netlib/TESTING/LIN/schktr.f index 66fa0bee7..5aeb1ce88 100644 --- a/lapack-netlib/TESTING/LIN/schktr.f +++ b/lapack-netlib/TESTING/LIN/schktr.f @@ -30,7 +30,7 @@ *> *> \verbatim *> -*> SCHKTR tests STRTRI, -TRS, -RFS, and -CON, and SLATRS +*> SCHKTR tests STRTRI, -TRS, -RFS, and -CON, and SLATRS(3) *> \endverbatim * * Arguments: @@ -187,7 +187,7 @@ INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) REAL 
ONE, ZERO @@ -198,13 +198,13 @@ CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - REAL AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + REAL AINVNM, ANORM, BIGNUM, DUMMY, RCOND, RCONDC, + $ RCONDI, RCONDO, RES, SCALE, SLAMCH * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - REAL RESULT( NTESTS ) + REAL RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -213,9 +213,9 @@ * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASUM, SCOPY, SERRTR, SGET04, - $ SLACPY, SLARHS, SLATRS, SLATTR, STRCON, STRRFS, - $ STRT01, STRT02, STRT03, STRT05, STRT06, STRTRI, - $ STRTRS, XLAENV + $ SLACPY, SLARHS, SLATRS, SLATRS3, SLATTR, SSCAL, + $ STRCON, STRRFS, STRT01, STRT02, STRT03, STRT05, + $ STRT06, STRTRI, STRTRS, XLAENV, SLAMCH * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -239,6 +239,7 @@ * PATH( 1: 1 ) = 'Single precision' PATH( 2: 3 ) = 'TR' + BIGNUM = SLAMCH('Overflow') / SLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -539,6 +540,33 @@ $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B +* + SRNAMT = 'SLATRS3' + CALL SCOPY( N, X, 1, B, 1 ) + CALL SCOPY( N, X, 1, B( N+1 ), 1 ) + CALL SSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL SLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from SLATRS3. +* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'SLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'Y', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) +* + CALL STRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3 ( 1 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL SSCAL( N, BIGNUM, X, 1 ) + CALL STRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RES ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -556,7 +584,14 @@ $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'SLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE @@ -569,8 +604,8 @@ 9999 FORMAT( ' UPLO=''', A1, ''', DIAG=''', A1, ''', N=', I5, ', NB=', $ I4, ', type ', I2, ', test(', I2, ')= ', G12.5 ) 9998 FORMAT( ' UPLO=''', A1, ''', TRANS=''', A1, ''', DIAG=''', A1, - $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', - $ test(', I2, ')= ', G12.5 ) + $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', test(', + $ I2, ')= ', G12.5 ) 9997 FORMAT( ' NORM=''', A1, ''', UPLO =''', A1, ''', N=', I5, ',', $ 11X, ' type ', I2, ', test(', I2, ')=', G12.5 ) 9996 FORMAT( 1X, A, '( ''', A1, ''', ''', A1, ''', ''', A1, ''', ''', diff --git a/lapack-netlib/TESTING/LIN/serrtr.f b/lapack-netlib/TESTING/LIN/serrtr.f index f0d0a0ef2..af1ce0a8e 100644 --- a/lapack-netlib/TESTING/LIN/serrtr.f +++ b/lapack-netlib/TESTING/LIN/serrtr.f @@ -83,9 +83,10 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. 
- EXTERNAL ALAESM, CHKXER, SLATBS, SLATPS, SLATRS, STBCON, - $ STBRFS, STBTRS, STPCON, STPRFS, STPTRI, STPTRS, - $ STRCON, STRRFS, STRTI2, STRTRI, STRTRS + EXTERNAL ALAESM, CHKXER, SLATBS, SLATPS, SLATRS, + $ SLATRS3, STBCON, STBRFS, STBTRS, STPCON, + $ STPRFS, STPTRI, STPTRS, STRCON, STRRFS, STRTI2, + $ STRTRI, STRTRS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -244,6 +245,46 @@ INFOT = 7 CALL SLATRS( 'U', 'N', 'N', 'N', 2, A, 1, X, SCALE, W, INFO ) CALL CHKXER( 'SLATRS', INFOT, NOUT, LERR, OK ) +* +* SLATRS3 +* + SRNAMT = 'SLATRS3' + INFOT = 1 + CALL SLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL SLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, W, + $ W( 2 ), 1, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL SLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, W, + $ W( 2 ), 0, INFO ) + CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN * diff --git a/lapack-netlib/TESTING/LIN/zchktr.f b/lapack-netlib/TESTING/LIN/zchktr.f index 0a6f47b1e..275ca2857 100644 --- a/lapack-netlib/TESTING/LIN/zchktr.f +++ b/lapack-netlib/TESTING/LIN/zchktr.f @@ -31,7 +31,7 @@ *> *> \verbatim *> -*> ZCHKTR tests ZTRTRI, -TRS, -RFS, and -CON, and ZLATRS +*> ZCHKTR tests ZTRTRI, -TRS, -RFS, and -CON, and ZLATRS(3) *> \endverbatim * * Arguments: @@ -184,7 +184,7 @@ INTEGER NTYPE1, NTYPES PARAMETER ( NTYPE1 = 10, NTYPES = 18 ) INTEGER NTESTS - PARAMETER ( NTESTS = 9 ) + PARAMETER ( NTESTS = 10 ) INTEGER NTRAN PARAMETER ( NTRAN = 3 ) DOUBLE PRECISION ONE, ZERO @@ -195,13 +195,13 @@ CHARACTER*3 PATH INTEGER I, IDIAG, IMAT, IN, INB, INFO, IRHS, ITRAN, $ IUPLO, K, LDA, N, NB, NERRS, NFAIL, NRHS, NRUN - DOUBLE PRECISION AINVNM, ANORM, DUMMY, RCOND, RCONDC, RCONDI, - $ RCONDO, SCALE + DOUBLE PRECISION AINVNM, ANORM, BIGNUM, DUMMY, RCOND, RCONDC, + $ RCONDI, RCONDO, RES, SCALE, DLAMCH * .. * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - DOUBLE PRECISION RESULT( NTESTS ) + DOUBLE PRECISION RESULT( NTESTS ), SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -209,10 +209,10 @@ EXTERNAL LSAME, ZLANTR * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASUM, XLAENV, ZCOPY, ZERRTR, - $ ZGET04, ZLACPY, ZLARHS, ZLATRS, ZLATTR, ZTRCON, - $ ZTRRFS, ZTRT01, ZTRT02, ZTRT03, ZTRT05, ZTRT06, - $ ZTRTRI, ZTRTRS + EXTERNAL ALAERH, ALAHD, ALASUM, DLAMCH, XLAENV, ZCOPY, + $ ZDSCAL, ZERRTR, ZGET04, ZLACPY, ZLARHS, ZLATRS, + $ ZLATRS3, ZLATTR, ZTRCON, ZTRRFS, ZTRT01, + $ ZTRT02, ZTRT03, ZTRT05, ZTRT06, ZTRTRI, ZTRTRS * .. 
* .. Scalars in Common .. LOGICAL LERR, OK @@ -236,6 +236,7 @@ * PATH( 1: 1 ) = 'Zomplex precision' PATH( 2: 3 ) = 'TR' + BIGNUM = DLAMCH('Overflow') / DLAMCH('Precision') NRUN = 0 NFAIL = 0 NERRS = 0 @@ -380,7 +381,7 @@ * This line is needed on a Sun SPARCstation. * IF( N.GT.0 ) - $ DUMMY = A( 1 ) + $ DUMMY = DBLE( A( 1 ) ) * CALL ZTRT02( UPLO, TRANS, DIAG, N, NRHS, A, LDA, $ X, LDA, B, LDA, WORK, RWORK, @@ -535,6 +536,32 @@ $ RWORK, ONE, B( N+1 ), LDA, X, LDA, WORK, $ RESULT( 9 ) ) * +*+ TEST 10 +* Solve op(A)*X = B +* + SRNAMT = 'ZLATRS3' + CALL ZCOPY( N, X, 1, B, 1 ) + CALL ZCOPY( N, X, 1, B( N+1 ), 1 ) + CALL ZDSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL ZLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, + $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, + $ INFO ) +* +* Check error code from ZLATRS3. +* + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'ZLATRS3', INFO, 0, + $ UPLO // TRANS // DIAG // 'N', N, N, + $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) + CALL ZTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, + $ X, LDA, WORK, RESULT( 10 ) ) + CALL ZDSCAL( N, BIGNUM, X, 1 ) + CALL ZTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, + $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, + $ X, LDA, WORK, RES ) + RESULT( 10 ) = MAX( RESULT( 10 ), RES ) +* * Print information about the tests that did not pass * the threshold. * @@ -552,7 +579,14 @@ $ DIAG, 'Y', N, IMAT, 9, RESULT( 9 ) NFAIL = NFAIL + 1 END IF - NRUN = NRUN + 2 + IF( RESULT( 10 ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9996 )'ZLATRS3', UPLO, TRANS, + $ DIAG, 'N', N, IMAT, 10, RESULT( 10 ) + NFAIL = NFAIL + 1 + END IF + NRUN = NRUN + 3 90 CONTINUE 100 CONTINUE 110 CONTINUE @@ -565,8 +599,8 @@ 9999 FORMAT( ' UPLO=''', A1, ''', DIAG=''', A1, ''', N=', I5, ', NB=', $ I4, ', type ', I2, ', test(', I2, ')= ', G12.5 ) 9998 FORMAT( ' UPLO=''', A1, ''', TRANS=''', A1, ''', DIAG=''', A1, - $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', - $ test(', I2, ')= ', G12.5 ) + $ ''', N=', I5, ', NB=', I4, ', type ', I2, ', test(', + $ I2, ')= ', G12.5 ) 9997 FORMAT( ' NORM=''', A1, ''', UPLO =''', A1, ''', N=', I5, ',', $ 11X, ' type ', I2, ', test(', I2, ')=', G12.5 ) 9996 FORMAT( 1X, A, '( ''', A1, ''', ''', A1, ''', ''', A1, ''', ''', diff --git a/lapack-netlib/TESTING/LIN/zerrtr.f b/lapack-netlib/TESTING/LIN/zerrtr.f index 098040ace..211b92154 100644 --- a/lapack-netlib/TESTING/LIN/zerrtr.f +++ b/lapack-netlib/TESTING/LIN/zerrtr.f @@ -82,9 +82,10 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, ZLATBS, ZLATPS, ZLATRS, ZTBCON, - $ ZTBRFS, ZTBTRS, ZTPCON, ZTPRFS, ZTPTRI, ZTPTRS, - $ ZTRCON, ZTRRFS, ZTRTI2, ZTRTRI, ZTRTRS + EXTERNAL ALAESM, CHKXER, ZLATBS, ZLATPS, ZLATRS, + $ ZLATRS3, ZTBCON, ZTBRFS, ZTBTRS, ZTPCON, + $ ZTPRFS, ZTPTRI, ZTPTRS, ZTRCON, ZTRRFS, ZTRTI2, + $ ZTRTRI, ZTRTRS * .. * .. Scalars in Common .. 
LOGICAL LERR, OK @@ -240,6 +241,46 @@ CALL ZLATRS( 'U', 'N', 'N', 'N', 2, A, 1, X, SCALE, RW, INFO ) CALL CHKXER( 'ZLATRS', INFOT, NOUT, LERR, OK ) * +* ZLATRS3 +* + SRNAMT = 'ZLATRS3' + INFOT = 1 + CALL ZLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, RW, + $ RW( 2 ), 1, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) + INFOT = 14 + CALL ZLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, RW, + $ RW( 2 ), 0, INFO ) + CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) +* * Test error exits for the packed triangular routines. * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN From fb42a0cf8b4373de42aab691ea4b939185c8bfa3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Nov 2022 14:06:50 +0100 Subject: [PATCH 089/154] Add a BLAS3-based triangular Sylvester equation solver (Reference-LAPACK PR 651) --- cmake/lapack.cmake | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 3b221d420..4c8efa11f 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -123,7 +123,8 @@ set(SLASRC ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f sgesvdq.f slaorhr_col_getrfnp.f - slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f ) + slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f + slarmm.f slatrs3.f strsyl3.f) set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f @@ -221,7 +222,8 @@ set(CLASRC cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f - cungtsqr.f cungtsqr_row.f cunhr_col.f ) + cungtsqr.f cungtsqr_row.f cunhr_col.f + clatrs3.f ctrsyl3.f ) set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f @@ -313,7 +315,8 @@ set(DLASRC dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f - dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f ) + dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f + dlarmm.f dlatrs3.f dtrsyl3.f) set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f @@ -415,7 +418,8 @@ set(ZLASRC zheevd_2stage.f zheev_2stage.f zheevx_2stage.f 
zheevr_2stage.f zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f - zungtsqr.f zungtsqr_row.f zunhr_col.f) + zungtsqr.f zungtsqr_row.f zunhr_col.f + zlarts3.f ztrsyl3.f) set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f From bb652f65a37a6d0bb973074136e5742b61d23cdb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Nov 2022 16:35:13 +0100 Subject: [PATCH 090/154] Typo fix --- cmake/lapack.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 4c8efa11f..a78a89f1a 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -419,7 +419,7 @@ set(ZLASRC zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f zungtsqr.f zungtsqr_row.f zunhr_col.f - zlarts3.f ztrsyl3.f) + zlatrs3.f ztrsyl3.f) set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f From 52c2a0397be870f7158009a30a029222faa12f56 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Nov 2022 17:13:08 +0100 Subject: [PATCH 091/154] Restore OpenBLAS modifications to link line --- lapack-netlib/TESTING/EIG/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index 3c8d9a8b2..d252c7fa9 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -98,7 +98,8 @@ set(ZEIGTST zchkee.F macro(add_eig_executable name) add_executable(${name} ${ARGN}) - target_link_libraries(${name} ${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) + target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) +#${TMGLIB} ../${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) endmacro() if(BUILD_SINGLE) From 2592853fc72ec3358ca0f30f72326d831df515e9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Nov 2022 21:47:37 +0100 Subject: [PATCH 092/154] Restore OpenBLAS-specific changes --- lapack-netlib/TESTING/EIG/Makefile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile index e40358663..942ae6982 100644 --- a/lapack-netlib/TESTING/EIG/Makefile +++ b/lapack-netlib/TESTING/EIG/Makefile @@ -127,17 +127,17 @@ complex: xeigtstc double: xeigtstd complex16: xeigtstz -xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ -xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) $(LAPACKLIB) $(BLASLIB) - $(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^ +xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB) + $(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^ $(AEIGTST): $(FRC) $(SCIGTST): $(FRC) From 
95da5141f0cb2c7a8aaf553f84287aef8eccf21f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Nov 2022 22:21:29 +0100 Subject: [PATCH 093/154] Add a BLAS3-based triangular Sylvester equation solver (Reference-LAPACK PR 651) --- lapack-netlib/SRC/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 03d15c23c..a5d5acdf2 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -207,7 +207,7 @@ SLASRC_O = \ ssytrd_2stage.o ssytrd_sy2sb.o ssytrd_sb2st.o ssb2st_kernels.o \ ssyevd_2stage.o ssyev_2stage.o ssyevx_2stage.o ssyevr_2stage.o \ ssbev_2stage.o ssbevx_2stage.o ssbevd_2stage.o ssygv_2stage.o \ - sgesvdq.o + sgesvdq.o slarmm.o slarts3.o strsyl3.o endif @@ -316,7 +316,7 @@ CLASRC_O = \ chetrd_2stage.o chetrd_he2hb.o chetrd_hb2st.o chb2st_kernels.o \ cheevd_2stage.o cheev_2stage.o cheevx_2stage.o cheevr_2stage.o \ chbev_2stage.o chbevx_2stage.o chbevd_2stage.o chegv_2stage.o \ - cgesvdq.o + cgesvdq.o clarts3.o ctrsyl3.o endif ifdef USEXBLAS @@ -417,7 +417,7 @@ DLASRC_O = \ dsytrd_2stage.o dsytrd_sy2sb.o dsytrd_sb2st.o dsb2st_kernels.o \ dsyevd_2stage.o dsyev_2stage.o dsyevx_2stage.o dsyevr_2stage.o \ dsbev_2stage.o dsbevx_2stage.o dsbevd_2stage.o dsygv_2stage.o \ - dgesvdq.o + dgesvdq.o dlarmm.o dlatrs3.o dtrsyl3.o endif ifdef USEXBLAS @@ -526,7 +526,7 @@ ZLASRC_O = \ zhetrd_2stage.o zhetrd_he2hb.o zhetrd_hb2st.o zhb2st_kernels.o \ zheevd_2stage.o zheev_2stage.o zheevx_2stage.o zheevr_2stage.o \ zhbev_2stage.o zhbevx_2stage.o zhbevd_2stage.o zhegv_2stage.o \ - zgesvdq.o + zgesvdq.o zlatrs3.o ztrsyl3.o endif ifdef USEXBLAS From 379efbe5af02059375c1eb1d312834789e17d13e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 15 Nov 2022 11:03:12 +0100 Subject: [PATCH 094/154] Fix typos --- lapack-netlib/SRC/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index a5d5acdf2..49eb69cfe 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -207,7 +207,7 @@ SLASRC_O = \ ssytrd_2stage.o ssytrd_sy2sb.o ssytrd_sb2st.o ssb2st_kernels.o \ ssyevd_2stage.o ssyev_2stage.o ssyevx_2stage.o ssyevr_2stage.o \ ssbev_2stage.o ssbevx_2stage.o ssbevd_2stage.o ssygv_2stage.o \ - sgesvdq.o slarmm.o slarts3.o strsyl3.o + sgesvdq.o slarmm.o slatrs3.o strsyl3.o endif @@ -316,7 +316,7 @@ CLASRC_O = \ chetrd_2stage.o chetrd_he2hb.o chetrd_hb2st.o chb2st_kernels.o \ cheevd_2stage.o cheev_2stage.o cheevx_2stage.o cheevr_2stage.o \ chbev_2stage.o chbevx_2stage.o chbevd_2stage.o chegv_2stage.o \ - cgesvdq.o clarts3.o ctrsyl3.o + cgesvdq.o clatrs3.o ctrsyl3.o endif ifdef USEXBLAS From b2cc310470a91165cda1e4da17426c4babf1845a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 15 Nov 2022 14:23:46 +0100 Subject: [PATCH 095/154] Add f2c-converted versions of the new BLAS3-based Sylvester solver --- lapack-netlib/SRC/clatrs3.c | 1155 ++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/ctrsyl3.c | 381 ++++++++++++ lapack-netlib/SRC/dlarmm.c | 478 +++++++++++++++ lapack-netlib/SRC/dlatrs3.c | 1138 ++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/dtrsyl3.c | 381 ++++++++++++ lapack-netlib/SRC/slarmm.c | 478 +++++++++++++++ lapack-netlib/SRC/slatrs3.c | 1135 ++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/strsyl3.c | 381 ++++++++++++ lapack-netlib/SRC/zlatrs3.c | 1157 +++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/ztrsyl3.c | 381 ++++++++++++ 10 files changed, 7065 insertions(+) create mode 
100644 lapack-netlib/SRC/clatrs3.c create mode 100644 lapack-netlib/SRC/ctrsyl3.c create mode 100644 lapack-netlib/SRC/dlarmm.c create mode 100644 lapack-netlib/SRC/dlatrs3.c create mode 100644 lapack-netlib/SRC/dtrsyl3.c create mode 100644 lapack-netlib/SRC/slarmm.c create mode 100644 lapack-netlib/SRC/slatrs3.c create mode 100644 lapack-netlib/SRC/strsyl3.c create mode 100644 lapack-netlib/SRC/zlatrs3.c create mode 100644 lapack-netlib/SRC/ztrsyl3.c diff --git a/lapack-netlib/SRC/clatrs3.c b/lapack-netlib/SRC/clatrs3.c new file mode 100644 index 000000000..6124a7f19 --- /dev/null +++ b/lapack-netlib/SRC/clatrs3.c @@ -0,0 +1,1155 @@ +/* f2c.h -- Standard Fortran to C header file */ + +/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." + + - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ + +#ifndef F2C_INCLUDE +#define F2C_INCLUDE + +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +typedef int integer; +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef 
struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimag(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) ceil(w) +#define myhuge_(w) HUGE_VAL +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 
1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; + _Complex float zdotc = 0.0; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b CLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. + */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE CLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, */ +/* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) */ + +/* CHARACTER DIAG, NORMIN, TRANS, UPLO */ +/* INTEGER INFO, LDA, LWORK, LDX, N, NRHS */ +/* REAL CNORM( * ), SCALE( * ), WORK( * ) */ +/* COMPLEX A( LDA, * ), X( LDX, * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > CLATRS3 solves one of the triangular systems */ +/* > */ +/* > A * X = B * diag(scale), A**T * X = B * diag(scale), or */ +/* > A**H * X = B * diag(scale) */ +/* > */ +/* > with scaling to prevent overflow. Here A is an upper or lower */ +/* > triangular matrix, A**T denotes the transpose of A, A**H denotes the */ +/* > conjugate transpose of A. X and B are n-by-nrhs matrices and scale */ +/* > is an nrhs-element vector of scaling factors. A scaling factor scale(j) */ +/* > is usually less than or equal to 1, chosen such that X(:,j) is less */ +/* > than the overflow threshold. If the matrix A is singular (A(j,j) = 0 */ +/* > for some j), then a non-trivial solution to A*X = 0 is returned. If */ +/* > the system is so badly scaled that the solution cannot be represented */ +/* > as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. */ +/* > */ +/* > This is a BLAS-3 version of LATRS for solving several right */ +/* > hand sides simultaneously. */ +/* > */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] UPLO */ +/* > \verbatim */ +/* > UPLO is CHARACTER*1 */ +/* > Specifies whether the matrix A is upper or lower triangular. */ +/* > = 'U': Upper triangular */ +/* > = 'L': Lower triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > Specifies the operation applied to A. */ +/* > = 'N': Solve A * x = s*b (No transpose) */ +/* > = 'T': Solve A**T* x = s*b (Transpose) */ +/* > = 'C': Solve A**T* x = s*b (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] DIAG */ +/* > \verbatim */ +/* > DIAG is CHARACTER*1 */ +/* > Specifies whether or not the matrix A is unit triangular. */ +/* > = 'N': Non-unit triangular */ +/* > = 'U': Unit triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NORMIN */ +/* > \verbatim */ +/* > NORMIN is CHARACTER*1 */ +/* > Specifies whether CNORM has been set or not. */ +/* > = 'Y': CNORM contains the column norms on entry */ +/* > = 'N': CNORM is not set on entry. On exit, the norms will */ +/* > be computed and stored in CNORM. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of X. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX array, dimension (LDA,N) */ +/* > The triangular matrix A. If UPLO = 'U', the leading n by n */ +/* > upper triangular part of the array A contains the upper */ +/* > triangular matrix, and the strictly lower triangular part of */ +/* > A is not referenced. If UPLO = 'L', the leading n by n lower */ +/* > triangular part of the array A contains the lower triangular */ +/* > matrix, and the strictly upper triangular part of A is not */ +/* > referenced. If DIAG = 'U', the diagonal elements of A are */ +/* > also not referenced and are assumed to be 1. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is COMPLEX array, dimension (LDX,NRHS) */ +/* > On entry, the right hand side B of the triangular system. */ +/* > On exit, X is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDX */ +/* > \verbatim */ +/* > LDX is INTEGER */ +/* > The leading dimension of the array X. LDX >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is REAL array, dimension (NRHS) */ +/* > The scaling factor s(k) is for the triangular system */ +/* > A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). */ +/* > If SCALE = 0, the matrix A is singular or badly scaled. */ +/* > If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) */ +/* > that is an exact or approximate solution to A*x(:,k) = 0 */ +/* > is returned. If the system so badly scaled that solution */ +/* > cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 */ +/* > is returned. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] CNORM */ +/* > \verbatim */ +/* > CNORM is REAL array, dimension (N) */ +/* > */ +/* > If NORMIN = 'Y', CNORM is an input argument and CNORM(j) */ +/* > contains the norm of the off-diagonal part of the j-th column */ +/* > of A. If TRANS = 'N', CNORM(j) must be greater than or equal */ +/* > to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) */ +/* > must be greater than or equal to the 1-norm. */ +/* > */ +/* > If NORMIN = 'N', CNORM is an output argument and CNORM(j) */ +/* > returns the 1-norm of the offdiagonal part of the j-th column */ +/* > of A. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is REAL array, dimension (LWORK). */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal size of */ +/* > WORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > LWORK is INTEGER */ +/* > LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where */ +/* > NBA = (N + NB - 1)/NB and NB is the optimal block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. 
*/ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -k, the k-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup doubleOTHERauxiliary */ +/* > \par Further Details: */ +/* ===================== */ +/* \verbatim */ +/* The algorithm follows the structure of a block triangular solve. */ +/* The diagonal block is solved with a call to the robust the triangular */ +/* solver LATRS for every right-hand side RHS = 1, ..., NRHS */ +/* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), */ +/* where op( A ) = A or op( A ) = A**T or op( A ) = A**H. */ +/* The linear block updates operate on block columns of X, */ +/* B( I, K ) - op(A( I, J )) * X( J, K ) */ +/* and use GEMM. To avoid overflow in the linear block update, the worst case */ +/* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed */ +/* such that */ +/* || s * B( I, RHS )||_oo */ +/* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold */ + +/* Once all columns of a block column have been rescaled (BLAS-1), the linear */ +/* update is executed with GEMM without overflow. */ + +/* To limit rescaling, local scale factors track the scaling of column segments. */ +/* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA */ +/* per right-hand side column RHS = 1, ..., NRHS. The global scale factor */ +/* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) */ +/* I = 1, ..., NBA. */ +/* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) */ +/* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The */ +/* linear update of potentially inconsistently scaled vector segments */ +/* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) */ +/* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, */ +/* if necessary, rescales the blocks prior to calling GEMM. */ + +/* \endverbatim */ +/* ===================================================================== */ +/* References: */ +/* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). */ +/* Parallel robust solution of triangular linear systems. Concurrency */ +/* and Computation: Practice and Experience, 31(19), e5064. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. 
*/ + +/* ===================================================================== */ +/* Subroutine */ int clatrs3_(char *uplo, char *trans, char *diag, char * + normin, integer *n, integer *nrhs, complex *a, integer *lda, complex * + x, integer *ldx, real *scale, real *cnorm, real *work, integer *lwork, + integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, + i__6, i__7, i__8; + real r__1, r__2; + complex q__1; + + /* Local variables */ + integer iinc, jinc; + real scal, anrm, bnrm; + integer awrk; + real tmax, xnrm[32]; + integer i__, j, k; + real w[64]; + extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *); + extern logical lsame_(char *, char *); + real rscal; + integer lanrm, ilast, jlast, i1; + logical upper; + integer i2, j1, j2, k1, k2, nb, ii, kk; + extern real clange_(char *, integer *, integer *, complex *, integer *, + real *); + integer lscale; + real scaloc; + extern real slamch_(char *); + extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + *); + real scamin; + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + real bignum; + extern /* Subroutine */ int clatrs_(char *, char *, char *, char *, + integer *, complex *, integer *, complex *, real *, real *, + integer *); + extern real slarmm_(real *, real *, real *); + integer ifirst; + logical notran; + integer jfirst; + real smlnum; + logical nounit, lquery; + integer nba, lds, nbx, rhs; + + + +/* ===================================================================== */ + + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + --scale; + --cnorm; + --work; + + /* Function Body */ + *info = 0; + upper = lsame_(uplo, "U"); + notran = lsame_(trans, "N"); + nounit = lsame_(diag, "N"); + lquery = *lwork == -1; + +/* Partition A and X into blocks. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "CLATRS", "", n, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + nb = f2cmin(64,nb); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*nrhs + 31) / 32; + nbx = f2cmax(i__1,i__2); + +/* Compute the workspace */ + +/* The workspace comprises two parts. */ +/* The first part stores the local scale factors. Each simultaneously */ +/* computed right-hand side requires one local scale factor per block */ +/* row. WORK( I + KK * LDS ) is the scale factor of the vector */ +/* segment associated with the I-th block row and the KK-th vector */ +/* in the block column. */ +/* Computing MAX */ + i__1 = nba, i__2 = f2cmin(*nrhs,32); + lscale = nba * f2cmax(i__1,i__2); + lds = nba; +/* The second part stores upper bounds of the triangular A. There are */ +/* a total of NBA x NBA blocks, of which only the upper triangular */ +/* part or the lower triangular part is referenced. The upper bound of */ +/* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). */ + lanrm = nba * nba; + awrk = lscale; + work[1] = (real) (lscale + lanrm); + +/* Test the input parameters. */ + + if (! upper && ! lsame_(uplo, "L")) { + *info = -1; + } else if (! notran && ! lsame_(trans, "T") && ! 
+ lsame_(trans, "C")) { + *info = -2; + } else if (! nounit && ! lsame_(diag, "U")) { + *info = -3; + } else if (! lsame_(normin, "Y") && ! lsame_(normin, + "N")) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*nrhs < 0) { + *info = -6; + } else if (*lda < f2cmax(1,*n)) { + *info = -8; + } else if (*ldx < f2cmax(1,*n)) { + *info = -10; + } else if (! lquery && (real) (*lwork) < work[1]) { + *info = -14; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("CLATRS3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Initialize scaling factors */ + + i__1 = *nrhs; + for (kk = 1; kk <= i__1; ++kk) { + scale[kk] = 1.f; + } + +/* Quick return if possible */ + + if (f2cmin(*n,*nrhs) == 0) { + return 0; + } + +/* Determine machine dependent constant to control overflow. */ + + bignum = slamch_("Overflow"); + smlnum = slamch_("Safe Minimum"); + +/* Use unblocked code for small problems */ + + if (*nrhs < 2) { + clatrs_(uplo, trans, diag, normin, n, &a[a_offset], lda, &x[x_dim1 + + 1], &scale[1], &cnorm[1], info); + i__1 = *nrhs; + for (k = 2; k <= i__1; ++k) { + clatrs_(uplo, trans, diag, "Y", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Compute norms of blocks of A excluding diagonal blocks and find */ +/* the block with the largest norm TMAX. */ + + tmax = 0.f; + i__1 = nba; + for (j = 1; j <= i__1; ++j) { + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + if (upper) { + ifirst = 1; + ilast = j - 1; + } else { + ifirst = j + 1; + ilast = nba; + } + i__2 = ilast; + for (i__ = ifirst; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*n) + 1; + +/* Compute upper bound of A( I1:I2-1, J1:J2-1 ). */ + + if (notran) { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = clange_("I", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + i__ + (j - 1) * nba] = anrm; + } else { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = clange_("1", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + j + (i__ - 1) * nba] = anrm; + } + tmax = f2cmax(tmax,anrm); + } + } + + if (! (tmax <= slamch_("Overflow"))) { + +/* Some matrix entries have huge absolute value. At least one upper */ +/* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point */ +/* number, either due to overflow in LANGE or due to Inf in A. */ +/* Fall back to LATRS. Set normin = 'N' for every right-hand side to */ +/* force computation of TSCAL in LATRS to avoid the likely overflow */ +/* in the computation of the column norms CNORM. */ + + i__1 = *nrhs; + for (k = 1; k <= i__1; ++k) { + clatrs_(uplo, trans, diag, "N", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Every right-hand side requires workspace to store NBA local scale */ +/* factors. To save workspace, X is computed successively in block columns */ +/* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient */ +/* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. */ + i__1 = nbx; + for (k = 1; k <= i__1; ++k) { +/* Loop over block columns (index = K) of X and, for column-wise scalings, */ +/* over individual columns (index = KK). 
*/ +/* K1: column index of the first column in X( J, K ) */ +/* K2: column index of the first column in X( J, K+1 ) */ +/* so the K2 - K1 is the column count of the block X( J, K ) */ + k1 = (k - 1 << 5) + 1; +/* Computing MIN */ + i__2 = k << 5; + k2 = f2cmin(i__2,*nrhs) + 1; + +/* Initialize local scaling factors of current block column X( J, K ) */ + + i__2 = k2 - k1; + for (kk = 1; kk <= i__2; ++kk) { + i__3 = nba; + for (i__ = 1; i__ <= i__3; ++i__) { + work[i__ + kk * lds] = 1.f; + } + } + + if (notran) { + +/* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = nba; + jlast = 1; + jinc = -1; + } else { + jfirst = 1; + jlast = nba; + jinc = 1; + } + } else { + +/* Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ +/* where op(A) = A**T or op(A) = A**H */ + + if (upper) { + jfirst = 1; + jlast = nba; + jinc = 1; + } else { + jfirst = nba; + jlast = 1; + jinc = -1; + } + } + i__2 = jlast; + i__3 = jinc; + for (j = jfirst; i__3 < 0 ? j >= i__2 : j <= i__2; j += i__3) { +/* J1: row index of the first row in A( J, J ) */ +/* J2: row index of the first row in A( J+1, J+1 ) */ +/* so that J2 - J1 is the row count of the block A( J, J ) */ + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) */ + + i__4 = k2 - k1; + for (kk = 1; kk <= i__4; ++kk) { + rhs = k1 + kk - 1; + if (kk == 1) { + i__5 = j2 - j1; + clatrs_(uplo, trans, diag, "N", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } else { + i__5 = j2 - j1; + clatrs_(uplo, trans, diag, "Y", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } +/* Find largest absolute value entry in the vector segment */ +/* X( J1:J2-1, RHS ) as an upper bound for the worst case */ +/* growth in the linear updates. */ + i__5 = j2 - j1; + xnrm[kk - 1] = clange_("I", &i__5, &c__1, &x[j1 + rhs * + x_dim1], ldx, w); + + if (scaloc == 0.f) { +/* LATRS found that A is singular through A(j,j) = 0. */ +/* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 */ +/* and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is */ +/* set by LATRS. */ + scale[rhs] = 0.f; + i__5 = j1 - 1; + for (ii = 1; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0.f, x[i__6].i = 0.f; + } + i__5 = *n; + for (ii = j2; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0.f, x[i__6].i = 0.f; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.f; + } + scaloc = 1.f; + } else if (scaloc * work[j + kk * lds] == 0.f) { +/* LATRS computed a valid scale factor, but combined with */ +/* the current scaling the solution does not have a */ +/* scale factor > 0. */ + +/* Set WORK( J+KK*LDS ) to smallest valid scale */ +/* factor and increase SCALOC accordingly. */ + scal = work[j + kk * lds] / smlnum; + scaloc *= scal; + work[j + kk * lds] = smlnum; +/* If LATRS overestimated the growth, x may be */ +/* rescaled to preserve a valid combined scale */ +/* factor WORK( J, KK ) > 0. */ + rscal = 1.f / scaloc; + if (xnrm[kk - 1] * rscal <= bignum) { + xnrm[kk - 1] *= rscal; + i__5 = j2 - j1; + csscal_(&i__5, &rscal, &x[j1 + rhs * x_dim1], &c__1); + scaloc = 1.f; + } else { +/* The system op(A) * x = b is badly scaled and its */ +/* solution cannot be represented as (1/scale) * x. */ +/* Set x to zero. 
This approach deviates from LATRS */ +/* where a completely meaningless non-zero vector */ +/* is returned that is not a solution to op(A) * x = b. */ + scale[rhs] = 0.f; + i__5 = *n; + for (ii = 1; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0.f, x[i__6].i = 0.f; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.f; + } + scaloc = 1.f; + } + } + scaloc *= work[j + kk * lds]; + work[j + kk * lds] = scaloc; + } + +/* Linear block updates */ + + if (notran) { + if (upper) { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } else { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } + } else { + if (upper) { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } else { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } + } + + i__4 = ilast; + i__5 = iinc; + for (i__ = ifirst; i__5 < 0 ? i__ >= i__4 : i__ <= i__4; i__ += + i__5) { +/* I1: row index of the first column in X( I, K ) */ +/* I2: row index of the first column in X( I+1, K ) */ +/* so the I2 - I1 is the row count of the block X( I, K ) */ + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__6 = i__ * nb; + i2 = f2cmin(i__6,*n) + 1; + +/* Prepare the linear update to be executed with GEMM. */ +/* For each column, compute a consistent scaling, a */ +/* scaling factor to survive the linear update, and */ +/* rescale the column segments, if necesssary. Then */ +/* the linear update is safely executed. */ + + i__6 = k2 - k1; + for (kk = 1; kk <= i__6; ++kk) { + rhs = k1 + kk - 1; +/* Compute consistent scaling */ +/* Computing MIN */ + r__1 = work[i__ + kk * lds], r__2 = work[j + kk * lds]; + scamin = f2cmin(r__1,r__2); + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__7 = i2 - i1; + bnrm = clange_("I", &i__7, &c__1, &x[i1 + rhs * x_dim1], + ldx, w); + bnrm *= scamin / work[i__ + kk * lds]; + xnrm[kk - 1] *= scamin / work[j + kk * lds]; + anrm = work[awrk + i__ + (j - 1) * nba]; + scaloc = slarmm_(&anrm, &xnrm[kk - 1], &bnrm); + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to X( I, KK ) and X( J, KK ). 
*/ + + scal = scamin / work[i__ + kk * lds] * scaloc; + if (scal != 1.f) { + i__7 = i2 - i1; + csscal_(&i__7, &scal, &x[i1 + rhs * x_dim1], &c__1); + work[i__ + kk * lds] = scamin * scaloc; + } + + scal = scamin / work[j + kk * lds] * scaloc; + if (scal != 1.f) { + i__7 = j2 - j1; + csscal_(&i__7, &scal, &x[j1 + rhs * x_dim1], &c__1); + work[j + kk * lds] = scamin * scaloc; + } + } + + if (notran) { + +/* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("N", "N", &i__6, &i__7, &i__8, &q__1, &a[i1 + j1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b2, & + x[i1 + k1 * x_dim1], ldx); + } else if (lsame_(trans, "T")) { + +/* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("T", "N", &i__6, &i__7, &i__8, &q__1, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b2, & + x[i1 + k1 * x_dim1], ldx); + } else { + +/* B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("C", "N", &i__6, &i__7, &i__8, &q__1, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b2, & + x[i1 + k1 * x_dim1], ldx); + } + } + } + +/* Reduce local scaling factors */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { +/* Computing MIN */ + r__1 = scale[rhs], r__2 = work[i__ + kk * lds]; + scale[rhs] = f2cmin(r__1,r__2); + } + } + +/* Realize consistent scaling */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + if (scale[rhs] != 1.f && scale[rhs] != 0.f) { + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__5 = i__ * nb; + i2 = f2cmin(i__5,*n) + 1; + scal = scale[rhs] / work[i__ + kk * lds]; + if (scal != 1.f) { + i__5 = i2 - i1; + csscal_(&i__5, &scal, &x[i1 + rhs * x_dim1], &c__1); + } + } + } + } + } + return 0; + +/* End of CLATRS3 */ + +} /* clatrs3_ */ + diff --git a/lapack-netlib/SRC/ctrsyl3.c b/lapack-netlib/SRC/ctrsyl3.c new file mode 100644 index 000000000..d05923a46 --- /dev/null +++ b/lapack-netlib/SRC/ctrsyl3.c @@ -0,0 +1,381 @@ +/* f2c.h -- Standard Fortran to C header file */ + +/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." 
+ + - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ + +#ifndef F2C_INCLUDE +#define F2C_INCLUDE + +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +typedef int integer; +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimag(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) ceil(w) +#define myhuge_(w) HUGE_VAL +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and 
-C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; + _Complex float zdotc = 0.0; + if (incx == 1 && incy == 1) { + for (i=0;i +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +typedef int integer; +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char 
*ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimag(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) ceil(w) +#define myhuge_(w) HUGE_VAL +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 
1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; + _Complex float zdotc = 0.0; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b DLARMM */ + +/* Definition: */ +/* =========== */ + +/* DOUBLE PRECISION FUNCTION DLARMM( ANORM, BNORM, CNORM ) */ + +/* DOUBLE PRECISION ANORM, BNORM, CNORM */ + +/* > \par Purpose: */ +/* ======= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > DLARMM returns a factor s in (0, 1] such that the linear updates */ +/* > */ +/* > (s * C) - A * (s * B) and (s * C) - (s * A) * B */ +/* > */ +/* > cannot overflow, where A, B, and C are matrices of conforming */ +/* > dimensions. */ +/* > */ +/* > This is an auxiliary routine so there is no argument checking. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========= */ + +/* > \param[in] ANORM */ +/* > \verbatim */ +/* > ANORM is DOUBLE PRECISION */ +/* > The infinity norm of A. ANORM >= 0. */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] BNORM */ +/* > \verbatim */ +/* > BNORM is DOUBLE PRECISION */ +/* > The infinity norm of B. BNORM >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] CNORM */ +/* > \verbatim */ +/* > CNORM is DOUBLE PRECISION */ +/* > The infinity norm of C. CNORM >= 0. */ +/* > \endverbatim */ +/* > */ +/* > */ +/* ===================================================================== */ +/* > References: */ +/* > C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for */ +/* > Robust Solution of Triangular Linear Systems. In: International */ +/* > Conference on Parallel Processing and Applied Mathematics, pages */ +/* > 68--78. Springer, 2017. */ +/* > */ +/* > \ingroup OTHERauxiliary */ +/* ===================================================================== */ +doublereal dlarmm_(doublereal *anorm, doublereal *bnorm, doublereal *cnorm) +{ + /* System generated locals */ + doublereal ret_val; + + /* Local variables */ + extern doublereal dlamch_(char *); + doublereal bignum, smlnum; + + + +/* Determine machine dependent parameters to control overflow. */ + + smlnum = dlamch_("Safe minimum") / dlamch_("Precision"); + bignum = 1. / smlnum / 4.; + +/* Compute a scale factor. */ + + ret_val = 1.; + if (*bnorm <= 1.) { + if (*anorm * *bnorm > bignum - *cnorm) { + ret_val = .5; + } + } else { + if (*anorm > (bignum - *cnorm) / *bnorm) { + ret_val = .5 / *bnorm; + } + } + return ret_val; + +/* ==== End of DLARMM ==== */ + +} /* dlarmm_ */ + diff --git a/lapack-netlib/SRC/dlatrs3.c b/lapack-netlib/SRC/dlatrs3.c new file mode 100644 index 000000000..b6e15eb12 --- /dev/null +++ b/lapack-netlib/SRC/dlatrs3.c @@ -0,0 +1,1138 @@ +/* f2c.h -- Standard Fortran to C header file */ + +/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." 
+ + - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ + +#ifndef F2C_INCLUDE +#define F2C_INCLUDE + +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +typedef int integer; +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimag(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) ceil(w) +#define myhuge_(w) HUGE_VAL +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and 
-C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; + _Complex float zdotc = 0.0; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b DLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. + */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE DLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, */ +/* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) */ + +/* CHARACTER DIAG, NORMIN, TRANS, UPLO */ +/* INTEGER INFO, LDA, LWORK, LDX, N, NRHS */ +/* DOUBLE PRECISION A( LDA, * ), CNORM( * ), SCALE( * ), */ +/* WORK( * ), X( LDX, * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > DLATRS3 solves one of the triangular systems */ +/* > */ +/* > A * X = B * diag(scale) or A**T * X = B * diag(scale) */ +/* > */ +/* > with scaling to prevent overflow. Here A is an upper or lower */ +/* > triangular matrix, A**T denotes the transpose of A. X and B are */ +/* > n by nrhs matrices and scale is an nrhs element vector of scaling */ +/* > factors. A scaling factor scale(j) is usually less than or equal */ +/* > to 1, chosen such that X(:,j) is less than the overflow threshold. */ +/* > If the matrix A is singular (A(j,j) = 0 for some j), then */ +/* > a non-trivial solution to A*X = 0 is returned. If the system is */ +/* > so badly scaled that the solution cannot be represented as */ +/* > (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. */ +/* > */ +/* > This is a BLAS-3 version of LATRS for solving several right */ +/* > hand sides simultaneously. 
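The scale factors are what keep the returned columns representable: column k of the output satisfies A * X(:,k) = scale(k) * B(:,k), so the nominal solution of A * x = b(:,k) is (1/scale(k)) * X(:,k) whenever scale(k) > 0. A minimal sketch of that optional post-processing step, assuming the division itself cannot overflow; the helper name is hypothetical and DLATRS3 never performs this division itself:

    /* Illustrative helper (not part of the patch): recover the nominal
       solution column from the scaled output of DLATRS3.  xk points at
       column k of X (n entries), scalek is SCALE(k).  Only meaningful when
       scalek > 0 and X(:,k)/scalek stays below the overflow threshold. */
    static void unscale_column(int n, double *xk, double scalek)
    {
        int i;
        if (scalek > 0.0 && scalek != 1.0) {
            for (i = 0; i < n; ++i)
                xk[i] /= scalek;     /* nominal x(:,k) = X(:,k) / scale(k) */
        }
    }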
*/ +/* > */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] UPLO */ +/* > \verbatim */ +/* > UPLO is CHARACTER*1 */ +/* > Specifies whether the matrix A is upper or lower triangular. */ +/* > = 'U': Upper triangular */ +/* > = 'L': Lower triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > Specifies the operation applied to A. */ +/* > = 'N': Solve A * x = s*b (No transpose) */ +/* > = 'T': Solve A**T* x = s*b (Transpose) */ +/* > = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] DIAG */ +/* > \verbatim */ +/* > DIAG is CHARACTER*1 */ +/* > Specifies whether or not the matrix A is unit triangular. */ +/* > = 'N': Non-unit triangular */ +/* > = 'U': Unit triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NORMIN */ +/* > \verbatim */ +/* > NORMIN is CHARACTER*1 */ +/* > Specifies whether CNORM has been set or not. */ +/* > = 'Y': CNORM contains the column norms on entry */ +/* > = 'N': CNORM is not set on entry. On exit, the norms will */ +/* > be computed and stored in CNORM. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of X. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is DOUBLE PRECISION array, dimension (LDA,N) */ +/* > The triangular matrix A. If UPLO = 'U', the leading n by n */ +/* > upper triangular part of the array A contains the upper */ +/* > triangular matrix, and the strictly lower triangular part of */ +/* > A is not referenced. If UPLO = 'L', the leading n by n lower */ +/* > triangular part of the array A contains the lower triangular */ +/* > matrix, and the strictly upper triangular part of A is not */ +/* > referenced. If DIAG = 'U', the diagonal elements of A are */ +/* > also not referenced and are assumed to be 1. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is DOUBLE PRECISION array, dimension (LDX,NRHS) */ +/* > On entry, the right hand side B of the triangular system. */ +/* > On exit, X is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDX */ +/* > \verbatim */ +/* > LDX is INTEGER */ +/* > The leading dimension of the array X. LDX >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is DOUBLE PRECISION array, dimension (NRHS) */ +/* > The scaling factor s(k) is for the triangular system */ +/* > A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). */ +/* > If SCALE = 0, the matrix A is singular or badly scaled. */ +/* > If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) */ +/* > that is an exact or approximate solution to A*x(:,k) = 0 */ +/* > is returned. If the system so badly scaled that solution */ +/* > cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 */ +/* > is returned. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] CNORM */ +/* > \verbatim */ +/* > CNORM is DOUBLE PRECISION array, dimension (N) */ +/* > */ +/* > If NORMIN = 'Y', CNORM is an input argument and CNORM(j) */ +/* > contains the norm of the off-diagonal part of the j-th column */ +/* > of A. If TRANS = 'N', CNORM(j) must be greater than or equal */ +/* > to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) */ +/* > must be greater than or equal to the 1-norm. */ +/* > */ +/* > If NORMIN = 'N', CNORM is an output argument and CNORM(j) */ +/* > returns the 1-norm of the offdiagonal part of the j-th column */ +/* > of A. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is DOUBLE PRECISION array, dimension (LWORK). */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal size of */ +/* > WORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > LWORK is INTEGER */ +/* > LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where */ +/* > NBA = (N + NB - 1)/NB and NB is the optimal block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -k, the k-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup doubleOTHERauxiliary */ +/* > \par Further Details: */ +/* ===================== */ +/* \verbatim */ +/* The algorithm follows the structure of a block triangular solve. */ +/* The diagonal block is solved with a call to the robust the triangular */ +/* solver LATRS for every right-hand side RHS = 1, ..., NRHS */ +/* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), */ +/* where op( A ) = A or op( A ) = A**T. */ +/* The linear block updates operate on block columns of X, */ +/* B( I, K ) - op(A( I, J )) * X( J, K ) */ +/* and use GEMM. To avoid overflow in the linear block update, the worst case */ +/* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed */ +/* such that */ +/* || s * B( I, RHS )||_oo */ +/* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold */ + +/* Once all columns of a block column have been rescaled (BLAS-1), the linear */ +/* update is executed with GEMM without overflow. */ + +/* To limit rescaling, local scale factors track the scaling of column segments. */ +/* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA */ +/* per right-hand side column RHS = 1, ..., NRHS. The global scale factor */ +/* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) */ +/* I = 1, ..., NBA. */ +/* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) */ +/* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The */ +/* linear update of potentially inconsistently scaled vector segments */ +/* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) */ +/* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, */ +/* if necessary, rescales the blocks prior to calling GEMM. 
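The LWORK = -1 workspace query documented above follows the usual LAPACK convention: call once with LWORK = -1, read the required size from WORK(1), allocate, then call again. A minimal sketch against the f2c prototype introduced by this patch; the driver name, the upper-triangular/no-transpose choice and the error handling are illustrative assumptions:

    #include <stdlib.h>

    extern int dlatrs3_(char *uplo, char *trans, char *diag, char *normin,
                        int *n, int *nrhs, double *a, int *lda,
                        double *x, int *ldx, double *scale, double *cnorm,
                        double *work, int *lwork, int *info);

    /* Sketch of a caller: solve A * X = B * diag(scale) for an upper
       triangular A.  On entry X holds B (ldx x nrhs), scale has nrhs
       entries, cnorm has n entries.  Returns the INFO value. */
    static int solve_upper_notrans(int n, int nrhs, double *a, int lda,
                                   double *x, int ldx,
                                   double *scale, double *cnorm)
    {
        int info = 0, lwork = -1;
        double query = 0.0, *work;

        /* Workspace query: WORK(1) returns the required size. */
        dlatrs3_("U", "N", "N", "N", &n, &nrhs, a, &lda, x, &ldx,
                 scale, cnorm, &query, &lwork, &info);
        if (info != 0) return info;

        lwork = (int) query;
        work = (double *) malloc((size_t) lwork * sizeof(double));
        if (work == NULL) return -14;   /* treated here like a bad LWORK */

        /* Actual blocked solve with scaling to prevent overflow. */
        dlatrs3_("U", "N", "N", "N", &n, &nrhs, a, &lda, x, &ldx,
                 scale, cnorm, work, &lwork, &info);
        free(work);
        return info;
    }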
*/ + +/* \endverbatim */ +/* ===================================================================== */ +/* References: */ +/* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). */ +/* Parallel robust solution of triangular linear systems. Concurrency */ +/* and Computation: Practice and Experience, 31(19), e5064. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. */ + +/* ===================================================================== */ +/* Subroutine */ int dlatrs3_(char *uplo, char *trans, char *diag, char * + normin, integer *n, integer *nrhs, doublereal *a, integer *lda, + doublereal *x, integer *ldx, doublereal *scale, doublereal *cnorm, + doublereal *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, + i__6, i__7, i__8; + doublereal d__1, d__2; + + /* Local variables */ + integer iinc, jinc; + doublereal scal, anrm, bnrm; + integer awrk; + doublereal tmax, xnrm[32]; + integer i__, j, k; + extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + integer *); + doublereal w[64]; + extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, + integer *, doublereal *, doublereal *, integer *, doublereal *, + integer *, doublereal *, doublereal *, integer *); + extern logical lsame_(char *, char *); + doublereal rscal; + integer lanrm, ilast, jlast, i1; + logical upper; + integer i2, j1, j2, k1, k2, nb, ii, kk; + extern doublereal dlamch_(char *), dlange_(char *, integer *, + integer *, doublereal *, integer *, doublereal *); + integer lscale; + doublereal scaloc, scamin; + extern doublereal dlarmm_(doublereal *, doublereal *, doublereal *); + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + doublereal bignum; + extern /* Subroutine */ int dlatrs_(char *, char *, char *, char *, + integer *, doublereal *, integer *, doublereal *, doublereal *, + doublereal *, integer *); + integer ifirst; + logical notran; + integer jfirst; + doublereal smlnum; + logical nounit, lquery; + integer nba, lds, nbx, rhs; + + + +/* ===================================================================== */ + + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + --scale; + --cnorm; + --work; + + /* Function Body */ + *info = 0; + upper = lsame_(uplo, "U"); + notran = lsame_(trans, "N"); + nounit = lsame_(diag, "N"); + lquery = *lwork == -1; + +/* Partition A and X into blocks */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "DLATRS", "", n, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + nb = f2cmin(64,nb); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*nrhs + 31) / 32; + nbx = f2cmax(i__1,i__2); + +/* Compute the workspace */ + +/* The workspace comprises two parts. */ +/* The first part stores the local scale factors. Each simultaneously */ +/* computed right-hand side requires one local scale factor per block */ +/* row. WORK( I+KK*LDS ) is the scale factor of the vector */ +/* segment associated with the I-th block row and the KK-th vector */ +/* in the block column. 
*/ +/* Computing MAX */ + i__1 = nba, i__2 = f2cmin(*nrhs,32); + lscale = nba * f2cmax(i__1,i__2); + lds = nba; +/* The second part stores upper bounds of the triangular A. There are */ +/* a total of NBA x NBA blocks, of which only the upper triangular */ +/* part or the lower triangular part is referenced. The upper bound of */ +/* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). */ + lanrm = nba * nba; + awrk = lscale; + work[1] = (doublereal) (lscale + lanrm); + +/* Test the input parameters */ + + if (! upper && ! lsame_(uplo, "L")) { + *info = -1; + } else if (! notran && ! lsame_(trans, "T") && ! + lsame_(trans, "C")) { + *info = -2; + } else if (! nounit && ! lsame_(diag, "U")) { + *info = -3; + } else if (! lsame_(normin, "Y") && ! lsame_(normin, + "N")) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*nrhs < 0) { + *info = -6; + } else if (*lda < f2cmax(1,*n)) { + *info = -8; + } else if (*ldx < f2cmax(1,*n)) { + *info = -10; + } else if (! lquery && (doublereal) (*lwork) < work[1]) { + *info = -14; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("DLATRS3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Initialize scaling factors */ + + i__1 = *nrhs; + for (kk = 1; kk <= i__1; ++kk) { + scale[kk] = 1.; + } + +/* Quick return if possible */ + + if (f2cmin(*n,*nrhs) == 0) { + return 0; + } + +/* Determine machine dependent constant to control overflow. */ + + bignum = dlamch_("Overflow"); + smlnum = dlamch_("Safe Minimum"); + +/* Use unblocked code for small problems */ + + if (*nrhs < 2) { + dlatrs_(uplo, trans, diag, normin, n, &a[a_offset], lda, &x[x_dim1 + + 1], &scale[1], &cnorm[1], info); + i__1 = *nrhs; + for (k = 2; k <= i__1; ++k) { + dlatrs_(uplo, trans, diag, "Y", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Compute norms of blocks of A excluding diagonal blocks and find */ +/* the block with the largest norm TMAX. */ + + tmax = 0.; + i__1 = nba; + for (j = 1; j <= i__1; ++j) { + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + if (upper) { + ifirst = 1; + ilast = j - 1; + } else { + ifirst = j + 1; + ilast = nba; + } + i__2 = ilast; + for (i__ = ifirst; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*n) + 1; + +/* Compute upper bound of A( I1:I2-1, J1:J2-1 ). */ + + if (notran) { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = dlange_("I", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + i__ + (j - 1) * nba] = anrm; + } else { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = dlange_("1", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + j + (i__ - 1) * nba] = anrm; + } + tmax = f2cmax(tmax,anrm); + } + } + + if (! (tmax <= dlamch_("Overflow"))) { + +/* Some matrix entries have huge absolute value. At least one upper */ +/* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point */ +/* number, either due to overflow in LANGE or due to Inf in A. */ +/* Fall back to LATRS. Set normin = 'N' for every right-hand side to */ +/* force computation of TSCAL in LATRS to avoid the likely overflow */ +/* in the computation of the column norms CNORM. */ + + i__1 = *nrhs; + for (k = 1; k <= i__1; ++k) { + dlatrs_(uplo, trans, diag, "N", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Every right-hand side requires workspace to store NBA local scale */ +/* factors. 
To save workspace, X is computed successively in block columns */ +/* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient */ +/* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. */ + i__1 = nbx; + for (k = 1; k <= i__1; ++k) { +/* Loop over block columns (index = K) of X and, for column-wise scalings, */ +/* over individual columns (index = KK). */ +/* K1: column index of the first column in X( J, K ) */ +/* K2: column index of the first column in X( J, K+1 ) */ +/* so the K2 - K1 is the column count of the block X( J, K ) */ + k1 = (k - 1 << 5) + 1; +/* Computing MIN */ + i__2 = k << 5; + k2 = f2cmin(i__2,*nrhs) + 1; + +/* Initialize local scaling factors of current block column X( J, K ) */ + + i__2 = k2 - k1; + for (kk = 1; kk <= i__2; ++kk) { + i__3 = nba; + for (i__ = 1; i__ <= i__3; ++i__) { + work[i__ + kk * lds] = 1.; + } + } + + if (notran) { + +/* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = nba; + jlast = 1; + jinc = -1; + } else { + jfirst = 1; + jlast = nba; + jinc = 1; + } + } else { + +/* Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = 1; + jlast = nba; + jinc = 1; + } else { + jfirst = nba; + jlast = 1; + jinc = -1; + } + } + + i__2 = jlast; + i__3 = jinc; + for (j = jfirst; i__3 < 0 ? j >= i__2 : j <= i__2; j += i__3) { +/* J1: row index of the first row in A( J, J ) */ +/* J2: row index of the first row in A( J+1, J+1 ) */ +/* so that J2 - J1 is the row count of the block A( J, J ) */ + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) */ +/* for all right-hand sides in the current block column, */ +/* one RHS at a time. */ + + i__4 = k2 - k1; + for (kk = 1; kk <= i__4; ++kk) { + rhs = k1 + kk - 1; + if (kk == 1) { + i__5 = j2 - j1; + dlatrs_(uplo, trans, diag, "N", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } else { + i__5 = j2 - j1; + dlatrs_(uplo, trans, diag, "Y", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } +/* Find largest absolute value entry in the vector segment */ +/* X( J1:J2-1, RHS ) as an upper bound for the worst case */ +/* growth in the linear updates. */ + i__5 = j2 - j1; + xnrm[kk - 1] = dlange_("I", &i__5, &c__1, &x[j1 + rhs * + x_dim1], ldx, w); + + if (scaloc == 0.) { +/* LATRS found that A is singular through A(j,j) = 0. */ +/* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 */ +/* and compute A*x = 0 (or A**T*x = 0). Note that */ +/* X(J1:J2-1, KK) is set by LATRS. */ + scale[rhs] = 0.; + i__5 = j1 - 1; + for (ii = 1; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.; + } + i__5 = *n; + for (ii = j2; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.; + } + scaloc = 1.; + } else if (scaloc * work[j + kk * lds] == 0.) { +/* LATRS computed a valid scale factor, but combined with */ +/* the current scaling the solution does not have a */ +/* scale factor > 0. */ + +/* Set WORK( J+KK*LDS ) to smallest valid scale */ +/* factor and increase SCALOC accordingly. */ + scal = work[j + kk * lds] / smlnum; + scaloc *= scal; + work[j + kk * lds] = smlnum; +/* If LATRS overestimated the growth, x may be */ +/* rescaled to preserve a valid combined scale */ +/* factor WORK( J, KK ) > 0. */ + rscal = 1. 
/ scaloc; + if (xnrm[kk - 1] * rscal <= bignum) { + xnrm[kk - 1] *= rscal; + i__5 = j2 - j1; + dscal_(&i__5, &rscal, &x[j1 + rhs * x_dim1], &c__1); + scaloc = 1.; + } else { +/* The system op(A) * x = b is badly scaled and its */ +/* solution cannot be represented as (1/scale) * x. */ +/* Set x to zero. This approach deviates from LATRS */ +/* where a completely meaningless non-zero vector */ +/* is returned that is not a solution to op(A) * x = b. */ + scale[rhs] = 0.; + i__5 = *n; + for (ii = 1; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.; + } + scaloc = 1.; + } + } + scaloc *= work[j + kk * lds]; + work[j + kk * lds] = scaloc; + } + +/* Linear block updates */ + + if (notran) { + if (upper) { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } else { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } + } else { + if (upper) { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } else { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } + } + + i__4 = ilast; + i__5 = iinc; + for (i__ = ifirst; i__5 < 0 ? i__ >= i__4 : i__ <= i__4; i__ += + i__5) { +/* I1: row index of the first column in X( I, K ) */ +/* I2: row index of the first column in X( I+1, K ) */ +/* so the I2 - I1 is the row count of the block X( I, K ) */ + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__6 = i__ * nb; + i2 = f2cmin(i__6,*n) + 1; + +/* Prepare the linear update to be executed with GEMM. */ +/* For each column, compute a consistent scaling, a */ +/* scaling factor to survive the linear update, and */ +/* rescale the column segments, if necesssary. Then */ +/* the linear update is safely executed. */ + + i__6 = k2 - k1; + for (kk = 1; kk <= i__6; ++kk) { + rhs = k1 + kk - 1; +/* Compute consistent scaling */ +/* Computing MIN */ + d__1 = work[i__ + kk * lds], d__2 = work[j + kk * lds]; + scamin = f2cmin(d__1,d__2); + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__7 = i2 - i1; + bnrm = dlange_("I", &i__7, &c__1, &x[i1 + rhs * x_dim1], + ldx, w); + bnrm *= scamin / work[i__ + kk * lds]; + xnrm[kk - 1] *= scamin / work[j + kk * lds]; + anrm = work[awrk + i__ + (j - 1) * nba]; + scaloc = dlarmm_(&anrm, &xnrm[kk - 1], &bnrm); + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to B( I, KK ) and B( J, KK ). */ + + scal = scamin / work[i__ + kk * lds] * scaloc; + if (scal != 1.) { + i__7 = i2 - i1; + dscal_(&i__7, &scal, &x[i1 + rhs * x_dim1], &c__1); + work[i__ + kk * lds] = scamin * scaloc; + } + + scal = scamin / work[j + kk * lds] * scaloc; + if (scal != 1.) 
{ + i__7 = j2 - j1; + dscal_(&i__7, &scal, &x[j1 + rhs * x_dim1], &c__1); + work[j + kk * lds] = scamin * scaloc; + } + } + + if (notran) { + +/* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + dgemm_("N", "N", &i__6, &i__7, &i__8, &c_b35, &a[i1 + j1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b36, + &x[i1 + k1 * x_dim1], ldx); + } else { + +/* B( I, K ) := B( I, K ) - A( J, I )**T * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + dgemm_("T", "N", &i__6, &i__7, &i__8, &c_b35, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b36, + &x[i1 + k1 * x_dim1], ldx); + } + } + } + +/* Reduce local scaling factors */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { +/* Computing MIN */ + d__1 = scale[rhs], d__2 = work[i__ + kk * lds]; + scale[rhs] = f2cmin(d__1,d__2); + } + } + +/* Realize consistent scaling */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + if (scale[rhs] != 1. && scale[rhs] != 0.) { + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__5 = i__ * nb; + i2 = f2cmin(i__5,*n) + 1; + scal = scale[rhs] / work[i__ + kk * lds]; + if (scal != 1.) { + i__5 = i2 - i1; + dscal_(&i__5, &scal, &x[i1 + rhs * x_dim1], &c__1); + } + } + } + } + } + return 0; + +/* End of DLATRS3 */ + +} /* dlatrs3_ */ + diff --git a/lapack-netlib/SRC/dtrsyl3.c b/lapack-netlib/SRC/dtrsyl3.c new file mode 100644 index 000000000..d05923a46 --- /dev/null +++ b/lapack-netlib/SRC/dtrsyl3.c @@ -0,0 +1,381 @@ +/* f2c.h -- Standard Fortran to C header file */ + +/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." 
+ + - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ + +#ifndef F2C_INCLUDE +#define F2C_INCLUDE + +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +typedef int integer; +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimag(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) ceil(w) +#define myhuge_(w) HUGE_VAL +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and 
-C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; + _Complex float zdotc = 0.0; + if (incx == 1 && incy == 1) { + for (i=0;i +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +typedef int integer; +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char 
*ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimag(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) ceil(w) +#define myhuge_(w) HUGE_VAL +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 
1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; + _Complex float zdotc = 0.0; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b SLARMM */ + +/* Definition: */ +/* =========== */ + +/* REAL FUNCTION SLARMM( ANORM, BNORM, CNORM ) */ + +/* REAL ANORM, BNORM, CNORM */ + +/* > \par Purpose: */ +/* ======= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > SLARMM returns a factor s in (0, 1] such that the linear updates */ +/* > */ +/* > (s * C) - A * (s * B) and (s * C) - (s * A) * B */ +/* > */ +/* > cannot overflow, where A, B, and C are matrices of conforming */ +/* > dimensions. */ +/* > */ +/* > This is an auxiliary routine so there is no argument checking. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========= */ + +/* > \param[in] ANORM */ +/* > \verbatim */ +/* > ANORM is REAL */ +/* > The infinity norm of A. ANORM >= 0. */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] BNORM */ +/* > \verbatim */ +/* > BNORM is REAL */ +/* > The infinity norm of B. BNORM >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] CNORM */ +/* > \verbatim */ +/* > CNORM is REAL */ +/* > The infinity norm of C. CNORM >= 0. */ +/* > \endverbatim */ +/* > */ +/* > */ +/* ===================================================================== */ +/* > References: */ +/* > C. C. Kjelgaard Mikkelsen and L. Karlsson, Blocked Algorithms for */ +/* > Robust Solution of Triangular Linear Systems. In: International */ +/* > Conference on Parallel Processing and Applied Mathematics, pages */ +/* > 68--78. Springer, 2017. */ +/* > */ +/* > \ingroup OTHERauxiliary */ +/* ===================================================================== */ +real slarmm_(real *anorm, real *bnorm, real *cnorm) +{ + /* System generated locals */ + real ret_val; + + /* Local variables */ + extern real slamch_(char *); + real bignum, smlnum; + + + +/* Determine machine dependent parameters to control overflow. */ + + smlnum = slamch_("Safe minimum") / slamch_("Precision"); + bignum = 1.f / smlnum / 4.f; + +/* Compute a scale factor. */ + + ret_val = 1.f; + if (*bnorm <= 1.f) { + if (*anorm * *bnorm > bignum - *cnorm) { + ret_val = .5f; + } + } else { + if (*anorm > (bignum - *cnorm) / *bnorm) { + ret_val = .5f / *bnorm; + } + } + return ret_val; + +/* ==== End of SLARMM ==== */ + +} /* slarmm_ */ + diff --git a/lapack-netlib/SRC/slatrs3.c b/lapack-netlib/SRC/slatrs3.c new file mode 100644 index 000000000..2d8c0ab33 --- /dev/null +++ b/lapack-netlib/SRC/slatrs3.c @@ -0,0 +1,1135 @@ +/* f2c.h -- Standard Fortran to C header file */ + +/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." 
+ + - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ + +#ifndef F2C_INCLUDE +#define F2C_INCLUDE + +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +typedef int integer; +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimag(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) ceil(w) +#define myhuge_(w) HUGE_VAL +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and 
-C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; + _Complex float zdotc = 0.0; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b SLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. + */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE SLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, */ +/* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) */ + +/* CHARACTER DIAG, NORMIN, TRANS, UPLO */ +/* INTEGER INFO, LDA, LWORK, LDX, N, NRHS */ +/* REAL A( LDA, * ), CNORM( * ), SCALE( * ), */ +/* WORK( * ), X( LDX, * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > SLATRS3 solves one of the triangular systems */ +/* > */ +/* > A * X = B * diag(scale) or A**T * X = B * diag(scale) */ +/* > */ +/* > with scaling to prevent overflow. Here A is an upper or lower */ +/* > triangular matrix, A**T denotes the transpose of A. X and B are */ +/* > n by nrhs matrices and scale is an nrhs element vector of scaling */ +/* > factors. A scaling factor scale(j) is usually less than or equal */ +/* > to 1, chosen such that X(:,j) is less than the overflow threshold. */ +/* > If the matrix A is singular (A(j,j) = 0 for some j), then */ +/* > a non-trivial solution to A*X = 0 is returned. If the system is */ +/* > so badly scaled that the solution cannot be represented as */ +/* > (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. */ +/* > */ +/* > This is a BLAS-3 version of LATRS for solving several right */ +/* > hand sides simultaneously. 
*/ +/* > */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] UPLO */ +/* > \verbatim */ +/* > UPLO is CHARACTER*1 */ +/* > Specifies whether the matrix A is upper or lower triangular. */ +/* > = 'U': Upper triangular */ +/* > = 'L': Lower triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > Specifies the operation applied to A. */ +/* > = 'N': Solve A * x = s*b (No transpose) */ +/* > = 'T': Solve A**T* x = s*b (Transpose) */ +/* > = 'C': Solve A**T* x = s*b (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] DIAG */ +/* > \verbatim */ +/* > DIAG is CHARACTER*1 */ +/* > Specifies whether or not the matrix A is unit triangular. */ +/* > = 'N': Non-unit triangular */ +/* > = 'U': Unit triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NORMIN */ +/* > \verbatim */ +/* > NORMIN is CHARACTER*1 */ +/* > Specifies whether CNORM has been set or not. */ +/* > = 'Y': CNORM contains the column norms on entry */ +/* > = 'N': CNORM is not set on entry. On exit, the norms will */ +/* > be computed and stored in CNORM. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of X. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is REAL array, dimension (LDA,N) */ +/* > The triangular matrix A. If UPLO = 'U', the leading n by n */ +/* > upper triangular part of the array A contains the upper */ +/* > triangular matrix, and the strictly lower triangular part of */ +/* > A is not referenced. If UPLO = 'L', the leading n by n lower */ +/* > triangular part of the array A contains the lower triangular */ +/* > matrix, and the strictly upper triangular part of A is not */ +/* > referenced. If DIAG = 'U', the diagonal elements of A are */ +/* > also not referenced and are assumed to be 1. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is REAL array, dimension (LDX,NRHS) */ +/* > On entry, the right hand side B of the triangular system. */ +/* > On exit, X is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDX */ +/* > \verbatim */ +/* > LDX is INTEGER */ +/* > The leading dimension of the array X. LDX >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is REAL array, dimension (NRHS) */ +/* > The scaling factor s(k) is for the triangular system */ +/* > A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). */ +/* > If SCALE = 0, the matrix A is singular or badly scaled. */ +/* > If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) */ +/* > that is an exact or approximate solution to A*x(:,k) = 0 */ +/* > is returned. If the system so badly scaled that solution */ +/* > cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 */ +/* > is returned. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] CNORM */ +/* > \verbatim */ +/* > CNORM is REAL array, dimension (N) */ +/* > */ +/* > If NORMIN = 'Y', CNORM is an input argument and CNORM(j) */ +/* > contains the norm of the off-diagonal part of the j-th column */ +/* > of A. If TRANS = 'N', CNORM(j) must be greater than or equal */ +/* > to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) */ +/* > must be greater than or equal to the 1-norm. */ +/* > */ +/* > If NORMIN = 'N', CNORM is an output argument and CNORM(j) */ +/* > returns the 1-norm of the offdiagonal part of the j-th column */ +/* > of A. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is REAL array, dimension (LWORK). */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal size of */ +/* > WORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > LWORK is INTEGER */ +/* > LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where */ +/* > NBA = (N + NB - 1)/NB and NB is the optimal block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -k, the k-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup doubleOTHERauxiliary */ +/* > \par Further Details: */ +/* ===================== */ +/* \verbatim */ +/* The algorithm follows the structure of a block triangular solve. */ +/* The diagonal block is solved with a call to the robust the triangular */ +/* solver LATRS for every right-hand side RHS = 1, ..., NRHS */ +/* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), */ +/* where op( A ) = A or op( A ) = A**T. */ +/* The linear block updates operate on block columns of X, */ +/* B( I, K ) - op(A( I, J )) * X( J, K ) */ +/* and use GEMM. To avoid overflow in the linear block update, the worst case */ +/* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed */ +/* such that */ +/* || s * B( I, RHS )||_oo */ +/* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold */ + +/* Once all columns of a block column have been rescaled (BLAS-1), the linear */ +/* update is executed with GEMM without overflow. */ + +/* To limit rescaling, local scale factors track the scaling of column segments. */ +/* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA */ +/* per right-hand side column RHS = 1, ..., NRHS. The global scale factor */ +/* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) */ +/* I = 1, ..., NBA. */ +/* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) */ +/* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The */ +/* linear update of potentially inconsistently scaled vector segments */ +/* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) */ +/* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, */ +/* if necessary, rescales the blocks prior to calling GEMM. 
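/* Hedged aside, not part of this patch: a standalone sketch of the growth
   bound described above. The routine itself obtains an equivalent factor from
   SLARMM; the helper name, the threshold choice and the halving loop here are
   assumptions made to keep the illustration free of corner-case arithmetic. */
#include <float.h>

float safe_update_scale(float anrm, float xnrm, float bnrm)
{
    const float threshold = FLT_MAX * 0.25f;   /* stay clearly below overflow */
    float scale = 1.0f;
    /* Halve the scale until  scale*bnrm + anrm*(scale*xnrm) <= threshold. */
    while (scale > 0.0f && bnrm * scale + anrm * (xnrm * scale) > threshold)
        scale *= 0.5f;
    return scale;                              /* s <= 1 used to rescale the RHS column */
}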
*/ + +/* \endverbatim */ +/* ===================================================================== */ +/* References: */ +/* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). */ +/* Parallel robust solution of triangular linear systems. Concurrency */ +/* and Computation: Practice and Experience, 31(19), e5064. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. */ + +/* ===================================================================== */ +/* Subroutine */ int slatrs3_(char *uplo, char *trans, char *diag, char * + normin, integer *n, integer *nrhs, real *a, integer *lda, real *x, + integer *ldx, real *scale, real *cnorm, real *work, integer *lwork, + integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, + i__6, i__7, i__8; + real r__1, r__2; + + /* Local variables */ + integer iinc, jinc; + real scal, anrm, bnrm; + integer awrk; + real tmax, xnrm[32]; + integer i__, j, k; + real w[64]; + extern logical lsame_(char *, char *); + real rscal; + extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + sgemm_(char *, char *, integer *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *); + integer lanrm, ilast, jlast, i1; + logical upper; + integer i2, j1, j2, k1, k2, nb, ii, kk, lscale; + real scaloc; + extern real slamch_(char *), slange_(char *, integer *, integer *, + real *, integer *, real *); + real scamin; + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + real bignum; + extern real slarmm_(real *, real *, real *); + integer ifirst; + logical notran; + integer jfirst; + extern /* Subroutine */ int slatrs_(char *, char *, char *, char *, + integer *, real *, integer *, real *, real *, real *, integer *); + real smlnum; + logical nounit, lquery; + integer nba, lds, nbx, rhs; + + + +/* ===================================================================== */ + + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + --scale; + --cnorm; + --work; + + /* Function Body */ + *info = 0; + upper = lsame_(uplo, "U"); + notran = lsame_(trans, "N"); + nounit = lsame_(diag, "N"); + lquery = *lwork == -1; + +/* Partition A and X into blocks. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "SLATRS", "", n, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + nb = f2cmin(64,nb); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*nrhs + 31) / 32; + nbx = f2cmax(i__1,i__2); + +/* Compute the workspace */ + +/* The workspace comprises two parts. */ +/* The first part stores the local scale factors. Each simultaneously */ +/* computed right-hand side requires one local scale factor per block */ +/* row. WORK( I + KK * LDS ) is the scale factor of the vector */ +/* segment associated with the I-th block row and the KK-th vector */ +/* in the block column. */ +/* Computing MAX */ + i__1 = nba, i__2 = f2cmin(*nrhs,32); + lscale = nba * f2cmax(i__1,i__2); + lds = nba; +/* The second part stores upper bounds of the triangular A. There are */ +/* a total of NBA x NBA blocks, of which only the upper triangular */ +/* part or the lower triangular part is referenced. 
The upper bound of */ +/* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). */ + lanrm = nba * nba; + awrk = lscale; + work[1] = (real) (lscale + lanrm); + +/* Test the input parameters. */ + + if (! upper && ! lsame_(uplo, "L")) { + *info = -1; + } else if (! notran && ! lsame_(trans, "T") && ! + lsame_(trans, "C")) { + *info = -2; + } else if (! nounit && ! lsame_(diag, "U")) { + *info = -3; + } else if (! lsame_(normin, "Y") && ! lsame_(normin, + "N")) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*nrhs < 0) { + *info = -6; + } else if (*lda < f2cmax(1,*n)) { + *info = -8; + } else if (*ldx < f2cmax(1,*n)) { + *info = -10; + } else if (! lquery && (real) (*lwork) < work[1]) { + *info = -14; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("SLATRS3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Initialize scaling factors */ + + i__1 = *nrhs; + for (kk = 1; kk <= i__1; ++kk) { + scale[kk] = 1.f; + } + +/* Quick return if possible */ + + if (f2cmin(*n,*nrhs) == 0) { + return 0; + } + +/* Determine machine dependent constant to control overflow. */ + + bignum = slamch_("Overflow"); + smlnum = slamch_("Safe Minimum"); + +/* Use unblocked code for small problems */ + + if (*nrhs < 2) { + slatrs_(uplo, trans, diag, normin, n, &a[a_offset], lda, &x[x_dim1 + + 1], &scale[1], &cnorm[1], info); + i__1 = *nrhs; + for (k = 2; k <= i__1; ++k) { + slatrs_(uplo, trans, diag, "Y", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Compute norms of blocks of A excluding diagonal blocks and find */ +/* the block with the largest norm TMAX. */ + + tmax = 0.f; + i__1 = nba; + for (j = 1; j <= i__1; ++j) { + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + if (upper) { + ifirst = 1; + ilast = j - 1; + } else { + ifirst = j + 1; + ilast = nba; + } + i__2 = ilast; + for (i__ = ifirst; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*n) + 1; + +/* Compute upper bound of A( I1:I2-1, J1:J2-1 ). */ + + if (notran) { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = slange_("I", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + i__ + (j - 1) * nba] = anrm; + } else { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = slange_("1", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + j + (i__ - 1) * nba] = anrm; + } + tmax = f2cmax(tmax,anrm); + } + } + + if (! (tmax <= slamch_("Overflow"))) { + +/* Some matrix entries have huge absolute value. At least one upper */ +/* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point */ +/* number, either due to overflow in LANGE or due to Inf in A. */ +/* Fall back to LATRS. Set normin = 'N' for every right-hand side to */ +/* force computation of TSCAL in LATRS to avoid the likely overflow */ +/* in the computation of the column norms CNORM. */ + + i__1 = *nrhs; + for (k = 1; k <= i__1; ++k) { + slatrs_(uplo, trans, diag, "N", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Every right-hand side requires workspace to store NBA local scale */ +/* factors. To save workspace, X is computed successively in block columns */ +/* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient */ +/* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. 
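/* Hedged aside, not part of this patch: the workspace bookkeeping above,
   condensed into one helper. It reproduces the value the routine stores in
   WORK(1) (local scale factors plus block norm bounds) and is bounded by the
   documented MAX(1, 2*NBA*MAX(NBA, MIN(NRHS, 32))). The helper name is an
   assumption. */
int latrs3_min_lwork(int n, int nb, int nrhs)
{
    int nba = (n + nb - 1) / nb;            /* number of block rows/columns        */
    if (nba < 1) nba = 1;
    int width = nrhs < 32 ? nrhs : 32;      /* X is processed 32 columns at a time */
    int big = nba > width ? nba : width;
    int lscale = nba * big;                 /* one local scale factor per block row and RHS */
    int lanrm = nba * nba;                  /* upper bounds of the blocks of A     */
    int lwork = lscale + lanrm;
    return lwork > 1 ? lwork : 1;
}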
*/ + i__1 = nbx; + for (k = 1; k <= i__1; ++k) { +/* Loop over block columns (index = K) of X and, for column-wise scalings, */ +/* over individual columns (index = KK). */ +/* K1: column index of the first column in X( J, K ) */ +/* K2: column index of the first column in X( J, K+1 ) */ +/* so the K2 - K1 is the column count of the block X( J, K ) */ + k1 = (k - 1 << 5) + 1; +/* Computing MIN */ + i__2 = k << 5; + k2 = f2cmin(i__2,*nrhs) + 1; + +/* Initialize local scaling factors of current block column X( J, K ) */ + + i__2 = k2 - k1; + for (kk = 1; kk <= i__2; ++kk) { + i__3 = nba; + for (i__ = 1; i__ <= i__3; ++i__) { + work[i__ + kk * lds] = 1.f; + } + } + + if (notran) { + +/* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = nba; + jlast = 1; + jinc = -1; + } else { + jfirst = 1; + jlast = nba; + jinc = 1; + } + } else { + +/* Solve A**T * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = 1; + jlast = nba; + jinc = 1; + } else { + jfirst = nba; + jlast = 1; + jinc = -1; + } + } + + i__2 = jlast; + i__3 = jinc; + for (j = jfirst; i__3 < 0 ? j >= i__2 : j <= i__2; j += i__3) { +/* J1: row index of the first row in A( J, J ) */ +/* J2: row index of the first row in A( J+1, J+1 ) */ +/* so that J2 - J1 is the row count of the block A( J, J ) */ + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) */ +/* for all right-hand sides in the current block column, */ +/* one RHS at a time. */ + + i__4 = k2 - k1; + for (kk = 1; kk <= i__4; ++kk) { + rhs = k1 + kk - 1; + if (kk == 1) { + i__5 = j2 - j1; + slatrs_(uplo, trans, diag, "N", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } else { + i__5 = j2 - j1; + slatrs_(uplo, trans, diag, "Y", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } +/* Find largest absolute value entry in the vector segment */ +/* X( J1:J2-1, RHS ) as an upper bound for the worst case */ +/* growth in the linear updates. */ + i__5 = j2 - j1; + xnrm[kk - 1] = slange_("I", &i__5, &c__1, &x[j1 + rhs * + x_dim1], ldx, w); + + if (scaloc == 0.f) { +/* LATRS found that A is singular through A(j,j) = 0. */ +/* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 */ +/* and compute A*x = 0 (or A**T*x = 0). Note that */ +/* X(J1:J2-1, KK) is set by LATRS. */ + scale[rhs] = 0.f; + i__5 = j1 - 1; + for (ii = 1; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.f; + } + i__5 = *n; + for (ii = j2; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.f; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.f; + } + scaloc = 1.f; + } else if (scaloc * work[j + kk * lds] == 0.f) { +/* LATRS computed a valid scale factor, but combined with */ +/* the current scaling the solution does not have a */ +/* scale factor > 0. */ + +/* Set WORK( J+KK*LDS ) to smallest valid scale */ +/* factor and increase SCALOC accordingly. */ + scal = work[j + kk * lds] / smlnum; + scaloc *= scal; + work[j + kk * lds] = smlnum; +/* If LATRS overestimated the growth, x may be */ +/* rescaled to preserve a valid combined scale */ +/* factor WORK( J, KK ) > 0. 
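/* Hedged aside, not part of this patch: a loose sketch of the scale-factor
   combination guarded above. When the product of the running block factor and
   the factor returned by LATRS underflows, the running factor is clamped to
   the safe minimum and the deficit is pushed into the LATRS factor, so the
   column can still be rescaled. The routine additionally rescales or zeroes X,
   which this sketch omits; names are assumptions. */
void combine_scales(float *s_run, float *s_new, float smlnum)
{
    if (*s_new != 0.0f && *s_run * *s_new == 0.0f) {
        float shift = *s_run / smlnum;   /* how far s_run sits above the safe minimum */
        *s_new *= shift;                 /* move the deficit into the new factor       */
        *s_run = smlnum;                 /* smallest valid running factor              */
    }
    *s_new *= *s_run;                    /* combined factor actually applied           */
    *s_run = *s_new;                     /* becomes the running factor for this block  */
}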
*/ + rscal = 1.f / scaloc; + if (xnrm[kk - 1] * rscal <= bignum) { + xnrm[kk - 1] *= rscal; + i__5 = j2 - j1; + sscal_(&i__5, &rscal, &x[j1 + rhs * x_dim1], &c__1); + scaloc = 1.f; + } else { +/* The system op(A) * x = b is badly scaled and its */ +/* solution cannot be represented as (1/scale) * x. */ +/* Set x to zero. This approach deviates from LATRS */ +/* where a completely meaningless non-zero vector */ +/* is returned that is not a solution to op(A) * x = b. */ + scale[rhs] = 0.f; + i__5 = *n; + for (ii = 1; ii <= i__5; ++ii) { + x[ii + kk * x_dim1] = 0.f; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.f; + } + scaloc = 1.f; + } + } + scaloc *= work[j + kk * lds]; + work[j + kk * lds] = scaloc; + } + +/* Linear block updates */ + + if (notran) { + if (upper) { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } else { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } + } else { + if (upper) { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } else { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } + } + + i__4 = ilast; + i__5 = iinc; + for (i__ = ifirst; i__5 < 0 ? i__ >= i__4 : i__ <= i__4; i__ += + i__5) { +/* I1: row index of the first column in X( I, K ) */ +/* I2: row index of the first column in X( I+1, K ) */ +/* so the I2 - I1 is the row count of the block X( I, K ) */ + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__6 = i__ * nb; + i2 = f2cmin(i__6,*n) + 1; + +/* Prepare the linear update to be executed with GEMM. */ +/* For each column, compute a consistent scaling, a */ +/* scaling factor to survive the linear update, and */ +/* rescale the column segments, if necesssary. Then */ +/* the linear update is safely executed. */ + + i__6 = k2 - k1; + for (kk = 1; kk <= i__6; ++kk) { + rhs = k1 + kk - 1; +/* Compute consistent scaling */ +/* Computing MIN */ + r__1 = work[i__ + kk * lds], r__2 = work[j + kk * lds]; + scamin = f2cmin(r__1,r__2); + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__7 = i2 - i1; + bnrm = slange_("I", &i__7, &c__1, &x[i1 + rhs * x_dim1], + ldx, w); + bnrm *= scamin / work[i__ + kk * lds]; + xnrm[kk - 1] *= scamin / work[j + kk * lds]; + anrm = work[awrk + i__ + (j - 1) * nba]; + scaloc = slarmm_(&anrm, &xnrm[kk - 1], &bnrm); + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to B( I, KK ) and B( J, KK ). 
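/* Hedged aside, not part of this patch: the consistent rescaling applied
   before the GEMM update. Both vector segments are brought to the common scale
   SCAMIN = min(s_i, s_j) times the robust factor SCALOC so the update operates
   on consistently scaled data. Plain loops stand in for SSCAL; the helper name
   is an assumption. */
void rescale_segments(float *xi, int ni, float *si,
                      float *xj, int nj, float *sj, float scaloc)
{
    float scamin = *si < *sj ? *si : *sj;
    float scal_i = scamin / *si * scaloc;
    float scal_j = scamin / *sj * scaloc;
    if (scal_i != 1.0f) {
        for (int k = 0; k < ni; ++k) xi[k] *= scal_i;
        *si = scamin * scaloc;
    }
    if (scal_j != 1.0f) {
        for (int k = 0; k < nj; ++k) xj[k] *= scal_j;
        *sj = scamin * scaloc;
    }
}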
*/ + + scal = scamin / work[i__ + kk * lds] * scaloc; + if (scal != 1.f) { + i__7 = i2 - i1; + sscal_(&i__7, &scal, &x[i1 + rhs * x_dim1], &c__1); + work[i__ + kk * lds] = scamin * scaloc; + } + + scal = scamin / work[j + kk * lds] * scaloc; + if (scal != 1.f) { + i__7 = j2 - j1; + sscal_(&i__7, &scal, &x[j1 + rhs * x_dim1], &c__1); + work[j + kk * lds] = scamin * scaloc; + } + } + + if (notran) { + +/* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + sgemm_("N", "N", &i__6, &i__7, &i__8, &c_b35, &a[i1 + j1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b36, + &x[i1 + k1 * x_dim1], ldx); + } else { + +/* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + sgemm_("T", "N", &i__6, &i__7, &i__8, &c_b35, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b36, + &x[i1 + k1 * x_dim1], ldx); + } + } + } + +/* Reduce local scaling factors */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { +/* Computing MIN */ + r__1 = scale[rhs], r__2 = work[i__ + kk * lds]; + scale[rhs] = f2cmin(r__1,r__2); + } + } + +/* Realize consistent scaling */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + if (scale[rhs] != 1.f && scale[rhs] != 0.f) { + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__5 = i__ * nb; + i2 = f2cmin(i__5,*n) + 1; + scal = scale[rhs] / work[i__ + kk * lds]; + if (scal != 1.f) { + i__5 = i2 - i1; + sscal_(&i__5, &scal, &x[i1 + rhs * x_dim1], &c__1); + } + } + } + } + } + return 0; + +/* End of SLATRS3 */ + +} /* slatrs3_ */ + diff --git a/lapack-netlib/SRC/strsyl3.c b/lapack-netlib/SRC/strsyl3.c new file mode 100644 index 000000000..d05923a46 --- /dev/null +++ b/lapack-netlib/SRC/strsyl3.c @@ -0,0 +1,381 @@ +/* f2c.h -- Standard Fortran to C header file */ + +/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." 
+ + - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ + +#ifndef F2C_INCLUDE +#define F2C_INCLUDE + +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +typedef int integer; +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimag(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) ceil(w) +#define myhuge_(w) HUGE_VAL +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and 
-C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; + _Complex float zdotc = 0.0; + if (incx == 1 && incy == 1) { + for (i=0;i +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +typedef int integer; +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char 
*ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimag(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) ceil(w) +#define myhuge_(w) HUGE_VAL +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 
1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; + _Complex float zdotc = 0.0; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b ZLATRS3 solves a triangular system of equations with the scale factors set to prevent overflow. + */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE ZLATRS3( UPLO, TRANS, DIAG, NORMIN, N, NRHS, A, LDA, */ +/* X, LDX, SCALE, CNORM, WORK, LWORK, INFO ) */ + +/* CHARACTER DIAG, NORMIN, TRANS, UPLO */ +/* INTEGER INFO, LDA, LWORK, LDX, N, NRHS */ +/* DOUBLE PRECISION CNORM( * ), SCALE( * ), WORK( * ) */ +/* COMPLEX*16 A( LDA, * ), X( LDX, * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > ZLATRS3 solves one of the triangular systems */ +/* > */ +/* > A * X = B * diag(scale), A**T * X = B * diag(scale), or */ +/* > A**H * X = B * diag(scale) */ +/* > */ +/* > with scaling to prevent overflow. Here A is an upper or lower */ +/* > triangular matrix, A**T denotes the transpose of A, A**H denotes the */ +/* > conjugate transpose of A. X and B are n-by-nrhs matrices and scale */ +/* > is an nrhs-element vector of scaling factors. A scaling factor scale(j) */ +/* > is usually less than or equal to 1, chosen such that X(:,j) is less */ +/* > than the overflow threshold. If the matrix A is singular (A(j,j) = 0 */ +/* > for some j), then a non-trivial solution to A*X = 0 is returned. If */ +/* > the system is so badly scaled that the solution cannot be represented */ +/* > as (1/scale(k))*X(:,k), then x(:,k) = 0 and scale(k) is returned. */ +/* > */ +/* > This is a BLAS-3 version of LATRS for solving several right */ +/* > hand sides simultaneously. */ +/* > */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] UPLO */ +/* > \verbatim */ +/* > UPLO is CHARACTER*1 */ +/* > Specifies whether the matrix A is upper or lower triangular. */ +/* > = 'U': Upper triangular */ +/* > = 'L': Lower triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > Specifies the operation applied to A. */ +/* > = 'N': Solve A * x = s*b (No transpose) */ +/* > = 'T': Solve A**T* x = s*b (Transpose) */ +/* > = 'C': Solve A**T* x = s*b (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] DIAG */ +/* > \verbatim */ +/* > DIAG is CHARACTER*1 */ +/* > Specifies whether or not the matrix A is unit triangular. */ +/* > = 'N': Non-unit triangular */ +/* > = 'U': Unit triangular */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NORMIN */ +/* > \verbatim */ +/* > NORMIN is CHARACTER*1 */ +/* > Specifies whether CNORM has been set or not. */ +/* > = 'Y': CNORM contains the column norms on entry */ +/* > = 'N': CNORM is not set on entry. On exit, the norms will */ +/* > be computed and stored in CNORM. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of columns of X. NRHS >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX*16 array, dimension (LDA,N) */ +/* > The triangular matrix A. If UPLO = 'U', the leading n by n */ +/* > upper triangular part of the array A contains the upper */ +/* > triangular matrix, and the strictly lower triangular part of */ +/* > A is not referenced. If UPLO = 'L', the leading n by n lower */ +/* > triangular part of the array A contains the lower triangular */ +/* > matrix, and the strictly upper triangular part of A is not */ +/* > referenced. If DIAG = 'U', the diagonal elements of A are */ +/* > also not referenced and are assumed to be 1. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] X */ +/* > \verbatim */ +/* > X is COMPLEX*16 array, dimension (LDX,NRHS) */ +/* > On entry, the right hand side B of the triangular system. */ +/* > On exit, X is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDX */ +/* > \verbatim */ +/* > LDX is INTEGER */ +/* > The leading dimension of the array X. LDX >= f2cmax (1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is DOUBLE PRECISION array, dimension (NRHS) */ +/* > The scaling factor s(k) is for the triangular system */ +/* > A * x(:,k) = s(k)*b(:,k) or A**T* x(:,k) = s(k)*b(:,k). */ +/* > If SCALE = 0, the matrix A is singular or badly scaled. */ +/* > If A(j,j) = 0 is encountered, a non-trivial vector x(:,k) */ +/* > that is an exact or approximate solution to A*x(:,k) = 0 */ +/* > is returned. If the system so badly scaled that solution */ +/* > cannot be presented as x(:,k) * 1/s(k), then x(:,k) = 0 */ +/* > is returned. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] CNORM */ +/* > \verbatim */ +/* > CNORM is DOUBLE PRECISION array, dimension (N) */ +/* > */ +/* > If NORMIN = 'Y', CNORM is an input argument and CNORM(j) */ +/* > contains the norm of the off-diagonal part of the j-th column */ +/* > of A. If TRANS = 'N', CNORM(j) must be greater than or equal */ +/* > to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) */ +/* > must be greater than or equal to the 1-norm. */ +/* > */ +/* > If NORMIN = 'N', CNORM is an output argument and CNORM(j) */ +/* > returns the 1-norm of the offdiagonal part of the j-th column */ +/* > of A. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is DOUBLE PRECISION array, dimension (LWORK). */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal size of */ +/* > WORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > LWORK is INTEGER */ +/* > LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where */ +/* > NBA = (N + NB - 1)/NB and NB is the optimal block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. 
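/* Hedged aside, not part of this patch: a usage sketch of the workspace query
   protocol described above, against the f2c-style prototype that appears
   further below. The wrapper name, the column-major caller-allocated a and x,
   and the minimal error handling are assumptions for illustration. */
#include <stdlib.h>

typedef int integer;
typedef double doublereal;
typedef struct { doublereal r, i; } doublecomplex;

extern int zlatrs3_(char *uplo, char *trans, char *diag, char *normin,
                    integer *n, integer *nrhs, doublecomplex *a, integer *lda,
                    doublecomplex *x, integer *ldx, doublereal *scale,
                    doublereal *cnorm, doublereal *work, integer *lwork,
                    integer *info);

int solve_upper_notrans(integer n, integer nrhs, doublecomplex *a, integer lda,
                        doublecomplex *x, integer ldx, doublereal *scale,
                        doublereal *cnorm)
{
    integer info = 0, lwork = -1;
    doublereal wkopt = 0.0;

    /* Workspace query: LWORK = -1 returns the optimal size in WORK(1). */
    zlatrs3_("U", "N", "N", "N", &n, &nrhs, a, &lda, x, &ldx,
             scale, cnorm, &wkopt, &lwork, &info);
    if (info != 0) return (int) info;

    lwork = (integer) wkopt;
    doublereal *work = (doublereal *) malloc((size_t) lwork * sizeof(*work));
    if (work == NULL) return -1;

    /* Actual solve with the sized workspace. */
    zlatrs3_("U", "N", "N", "N", &n, &nrhs, a, &lda, x, &ldx,
             scale, cnorm, work, &lwork, &info);
    free(work);
    return (int) info;
}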
*/ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -k, the k-th argument had an illegal value */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup doubleOTHERauxiliary */ +/* > \par Further Details: */ +/* ===================== */ +/* \verbatim */ +/* The algorithm follows the structure of a block triangular solve. */ +/* The diagonal block is solved with a call to the robust the triangular */ +/* solver LATRS for every right-hand side RHS = 1, ..., NRHS */ +/* op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ), */ +/* where op( A ) = A or op( A ) = A**T or op( A ) = A**H. */ +/* The linear block updates operate on block columns of X, */ +/* B( I, K ) - op(A( I, J )) * X( J, K ) */ +/* and use GEMM. To avoid overflow in the linear block update, the worst case */ +/* growth is estimated. For every RHS, a scale factor s <= 1.0 is computed */ +/* such that */ +/* || s * B( I, RHS )||_oo */ +/* + || op(A( I, J )) ||_oo * || s * X( J, RHS ) ||_oo <= Overflow threshold */ + +/* Once all columns of a block column have been rescaled (BLAS-1), the linear */ +/* update is executed with GEMM without overflow. */ + +/* To limit rescaling, local scale factors track the scaling of column segments. */ +/* There is one local scale factor s( I, RHS ) per block row I = 1, ..., NBA */ +/* per right-hand side column RHS = 1, ..., NRHS. The global scale factor */ +/* SCALE( RHS ) is chosen as the smallest local scale factor s( I, RHS ) */ +/* I = 1, ..., NBA. */ +/* A triangular solve op(A( J, J )) * x( J, RHS ) = SCALOC * b( J, RHS ) */ +/* updates the local scale factor s( J, RHS ) := s( J, RHS ) * SCALOC. The */ +/* linear update of potentially inconsistently scaled vector segments */ +/* s( I, RHS ) * b( I, RHS ) - op(A( I, J )) * ( s( J, RHS )* x( J, RHS ) ) */ +/* computes a consistent scaling SCAMIN = MIN( s(I, RHS ), s(J, RHS) ) and, */ +/* if necessary, rescales the blocks prior to calling GEMM. */ + +/* \endverbatim */ +/* ===================================================================== */ +/* References: */ +/* C. C. Kjelgaard Mikkelsen, A. B. Schwarz and L. Karlsson (2019). */ +/* Parallel robust solution of triangular linear systems. Concurrency */ +/* and Computation: Practice and Experience, 31(19), e5064. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. 
*/ + +/* ===================================================================== */ +/* Subroutine */ int zlatrs3_(char *uplo, char *trans, char *diag, char * + normin, integer *n, integer *nrhs, doublecomplex *a, integer *lda, + doublecomplex *x, integer *ldx, doublereal *scale, doublereal *cnorm, + doublereal *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, x_dim1, x_offset, i__1, i__2, i__3, i__4, i__5, + i__6, i__7, i__8; + doublereal d__1, d__2; + doublecomplex z__1; + + /* Local variables */ + integer iinc, jinc; + doublereal scal, anrm, bnrm; + integer awrk; + doublereal tmax, xnrm[32]; + integer i__, j, k; + doublereal w[64]; + extern logical lsame_(char *, char *); + doublereal rscal; + integer lanrm, ilast, jlast; + extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *); + integer i1; + logical upper; + integer i2, j1, j2, k1, k2, nb, ii, kk; + extern doublereal dlamch_(char *); + integer lscale; + doublereal scaloc, scamin; + extern doublereal dlarmm_(doublereal *, doublereal *, doublereal *); + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, + integer *, doublereal *); + doublereal bignum; + extern /* Subroutine */ int zdscal_(integer *, doublereal *, + doublecomplex *, integer *); + integer ifirst; + logical notran; + integer jfirst; + doublereal smlnum; + logical nounit; + extern /* Subroutine */ int zlatrs_(char *, char *, char *, char *, + integer *, doublecomplex *, integer *, doublecomplex *, + doublereal *, doublereal *, integer *); + logical lquery; + integer nba, lds, nbx, rhs; + + + +/* ===================================================================== */ + + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + x_dim1 = *ldx; + x_offset = 1 + x_dim1 * 1; + x -= x_offset; + --scale; + --cnorm; + --work; + + /* Function Body */ + *info = 0; + upper = lsame_(uplo, "U"); + notran = lsame_(trans, "N"); + nounit = lsame_(diag, "N"); + lquery = *lwork == -1; + +/* Partition A and X into blocks. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "ZLATRS", "", n, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + nb = f2cmin(64,nb); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*nrhs + 31) / 32; + nbx = f2cmax(i__1,i__2); + +/* Compute the workspace */ + +/* The workspace comprises two parts. */ +/* The first part stores the local scale factors. Each simultaneously */ +/* computed right-hand side requires one local scale factor per block */ +/* row. WORK( I + KK * LDS ) is the scale factor of the vector */ +/* segment associated with the I-th block row and the KK-th vector */ +/* in the block column. */ +/* Computing MAX */ + i__1 = nba, i__2 = f2cmin(*nrhs,32); + lscale = nba * f2cmax(i__1,i__2); + lds = nba; +/* The second part stores upper bounds of the triangular A. There are */ +/* a total of NBA x NBA blocks, of which only the upper triangular */ +/* part or the lower triangular part is referenced. The upper bound of */ +/* the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ). 
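/* Hedged aside, not part of this patch: the block-norm bookkeeping described
   above, written 0-based for clarity. With 0-based block indices i and j and a
   0-based view of the work array, the bound of block A(I,J) sits at offset
   awrk + i + j*nba for the non-transposed solve and with i and j swapped
   otherwise. The helper name and the 0-based convention are assumptions. */
double block_norm_bound(const double *work0, int awrk, int nba,
                        int i, int j, int notran)
{
    int row = notran ? i : j;        /* block row index under which the bound was stored */
    int col = notran ? j : i;        /* block column index                               */
    return work0[awrk + row + col * nba];
}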
*/ + lanrm = nba * nba; + awrk = lscale; + work[1] = (doublereal) (lscale + lanrm); + +/* Test the input parameters. */ + + if (! upper && ! lsame_(uplo, "L")) { + *info = -1; + } else if (! notran && ! lsame_(trans, "T") && ! + lsame_(trans, "C")) { + *info = -2; + } else if (! nounit && ! lsame_(diag, "U")) { + *info = -3; + } else if (! lsame_(normin, "Y") && ! lsame_(normin, + "N")) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*nrhs < 0) { + *info = -6; + } else if (*lda < f2cmax(1,*n)) { + *info = -8; + } else if (*ldx < f2cmax(1,*n)) { + *info = -10; + } else if (! lquery && (doublereal) (*lwork) < work[1]) { + *info = -14; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZLATRS3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Initialize scaling factors */ + + i__1 = *nrhs; + for (kk = 1; kk <= i__1; ++kk) { + scale[kk] = 1.; + } + +/* Quick return if possible */ + + if (f2cmin(*n,*nrhs) == 0) { + return 0; + } + +/* Determine machine dependent constant to control overflow. */ + + bignum = dlamch_("Overflow"); + smlnum = dlamch_("Safe Minimum"); + +/* Use unblocked code for small problems */ + + if (*nrhs < 2) { + zlatrs_(uplo, trans, diag, normin, n, &a[a_offset], lda, &x[x_dim1 + + 1], &scale[1], &cnorm[1], info); + i__1 = *nrhs; + for (k = 2; k <= i__1; ++k) { + zlatrs_(uplo, trans, diag, "Y", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Compute norms of blocks of A excluding diagonal blocks and find */ +/* the block with the largest norm TMAX. */ + + tmax = 0.; + i__1 = nba; + for (j = 1; j <= i__1; ++j) { + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + if (upper) { + ifirst = 1; + ilast = j - 1; + } else { + ifirst = j + 1; + ilast = nba; + } + i__2 = ilast; + for (i__ = ifirst; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*n) + 1; + +/* Compute upper bound of A( I1:I2-1, J1:J2-1 ). */ + + if (notran) { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = zlange_("I", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + i__ + (j - 1) * nba] = anrm; + } else { + i__3 = i2 - i1; + i__4 = j2 - j1; + anrm = zlange_("1", &i__3, &i__4, &a[i1 + j1 * a_dim1], lda, + w); + work[awrk + j + (i__ - 1) * nba] = anrm; + } + tmax = f2cmax(tmax,anrm); + } + } + + if (! (tmax <= dlamch_("Overflow"))) { + +/* Some matrix entries have huge absolute value. At least one upper */ +/* bound norm( A(I1:I2-1, J1:J2-1), 'I') is not a valid floating-point */ +/* number, either due to overflow in LANGE or due to Inf in A. */ +/* Fall back to LATRS. Set normin = 'N' for every right-hand side to */ +/* force computation of TSCAL in LATRS to avoid the likely overflow */ +/* in the computation of the column norms CNORM. */ + + i__1 = *nrhs; + for (k = 1; k <= i__1; ++k) { + zlatrs_(uplo, trans, diag, "N", n, &a[a_offset], lda, &x[k * + x_dim1 + 1], &scale[k], &cnorm[1], info); + } + return 0; + } + +/* Every right-hand side requires workspace to store NBA local scale */ +/* factors. To save workspace, X is computed successively in block columns */ +/* of width NBRHS, requiring a total of NBA x NBRHS space. If sufficient */ +/* workspace is available, larger values of NBRHS or NBRHS = NRHS are viable. */ + i__1 = nbx; + for (k = 1; k <= i__1; ++k) { +/* Loop over block columns (index = K) of X and, for column-wise scalings, */ +/* over individual columns (index = KK). 
*/ +/* K1: column index of the first column in X( J, K ) */ +/* K2: column index of the first column in X( J, K+1 ) */ +/* so the K2 - K1 is the column count of the block X( J, K ) */ + k1 = (k - 1 << 5) + 1; +/* Computing MIN */ + i__2 = k << 5; + k2 = f2cmin(i__2,*nrhs) + 1; + +/* Initialize local scaling factors of current block column X( J, K ) */ + + i__2 = k2 - k1; + for (kk = 1; kk <= i__2; ++kk) { + i__3 = nba; + for (i__ = 1; i__ <= i__3; ++i__) { + work[i__ + kk * lds] = 1.; + } + } + + if (notran) { + +/* Solve A * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ + + if (upper) { + jfirst = nba; + jlast = 1; + jinc = -1; + } else { + jfirst = 1; + jlast = nba; + jinc = 1; + } + } else { + +/* Solve op(A) * X(:, K1:K2-1) = B * diag(scale(K1:K2-1)) */ +/* where op(A) = A**T or op(A) = A**H */ + + if (upper) { + jfirst = 1; + jlast = nba; + jinc = 1; + } else { + jfirst = nba; + jlast = 1; + jinc = -1; + } + } + i__2 = jlast; + i__3 = jinc; + for (j = jfirst; i__3 < 0 ? j >= i__2 : j <= i__2; j += i__3) { +/* J1: row index of the first row in A( J, J ) */ +/* J2: row index of the first row in A( J+1, J+1 ) */ +/* so that J2 - J1 is the row count of the block A( J, J ) */ + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Solve op(A( J, J )) * X( J, RHS ) = SCALOC * B( J, RHS ) */ + + i__4 = k2 - k1; + for (kk = 1; kk <= i__4; ++kk) { + rhs = k1 + kk - 1; + if (kk == 1) { + i__5 = j2 - j1; + zlatrs_(uplo, trans, diag, "N", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } else { + i__5 = j2 - j1; + zlatrs_(uplo, trans, diag, "Y", &i__5, &a[j1 + j1 * + a_dim1], lda, &x[j1 + rhs * x_dim1], &scaloc, & + cnorm[1], info); + } +/* Find largest absolute value entry in the vector segment */ +/* X( J1:J2-1, RHS ) as an upper bound for the worst case */ +/* growth in the linear updates. */ + i__5 = j2 - j1; + xnrm[kk - 1] = zlange_("I", &i__5, &c__1, &x[j1 + rhs * + x_dim1], ldx, w); + + if (scaloc == 0.) { +/* LATRS found that A is singular through A(j,j) = 0. */ +/* Reset the computation x(1:n) = 0, x(j) = 1, SCALE = 0 */ +/* and compute op(A)*x = 0. Note that X(J1:J2-1, KK) is */ +/* set by LATRS. */ + scale[rhs] = 0.; + i__5 = j1 - 1; + for (ii = 1; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0., x[i__6].i = 0.; + } + i__5 = *n; + for (ii = j2; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0., x[i__6].i = 0.; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.; + } + scaloc = 1.; + } else if (scaloc * work[j + kk * lds] == 0.) { +/* LATRS computed a valid scale factor, but combined with */ +/* the current scaling the solution does not have a */ +/* scale factor > 0. */ + +/* Set WORK( J+KK*LDS ) to smallest valid scale */ +/* factor and increase SCALOC accordingly. */ + scal = work[j + kk * lds] / smlnum; + scaloc *= scal; + work[j + kk * lds] = smlnum; +/* If LATRS overestimated the growth, x may be */ +/* rescaled to preserve a valid combined scale */ +/* factor WORK( J, KK ) > 0. */ + rscal = 1. / scaloc; + if (xnrm[kk - 1] * rscal <= bignum) { + xnrm[kk - 1] *= rscal; + i__5 = j2 - j1; + zdscal_(&i__5, &rscal, &x[j1 + rhs * x_dim1], &c__1); + scaloc = 1.; + } else { +/* The system op(A) * x = b is badly scaled and its */ +/* solution cannot be represented as (1/scale) * x. */ +/* Set x to zero. 
This approach deviates from LATRS */ +/* where a completely meaningless non-zero vector */ +/* is returned that is not a solution to op(A) * x = b. */ + scale[rhs] = 0.; + i__5 = *n; + for (ii = 1; ii <= i__5; ++ii) { + i__6 = ii + kk * x_dim1; + x[i__6].r = 0., x[i__6].i = 0.; + } +/* Discard the local scale factors. */ + i__5 = nba; + for (ii = 1; ii <= i__5; ++ii) { + work[ii + kk * lds] = 1.; + } + scaloc = 1.; + } + } + scaloc *= work[j + kk * lds]; + work[j + kk * lds] = scaloc; + } + +/* Linear block updates */ + + if (notran) { + if (upper) { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } else { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } + } else { + if (upper) { + ifirst = j + 1; + ilast = nba; + iinc = 1; + } else { + ifirst = j - 1; + ilast = 1; + iinc = -1; + } + } + + i__4 = ilast; + i__5 = iinc; + for (i__ = ifirst; i__5 < 0 ? i__ >= i__4 : i__ <= i__4; i__ += + i__5) { +/* I1: row index of the first column in X( I, K ) */ +/* I2: row index of the first column in X( I+1, K ) */ +/* so the I2 - I1 is the row count of the block X( I, K ) */ + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__6 = i__ * nb; + i2 = f2cmin(i__6,*n) + 1; + +/* Prepare the linear update to be executed with GEMM. */ +/* For each column, compute a consistent scaling, a */ +/* scaling factor to survive the linear update, and */ +/* rescale the column segments, if necesssary. Then */ +/* the linear update is safely executed. */ + + i__6 = k2 - k1; + for (kk = 1; kk <= i__6; ++kk) { + rhs = k1 + kk - 1; +/* Compute consistent scaling */ +/* Computing MIN */ + d__1 = work[i__ + kk * lds], d__2 = work[j + kk * lds]; + scamin = f2cmin(d__1,d__2); + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__7 = i2 - i1; + bnrm = zlange_("I", &i__7, &c__1, &x[i1 + rhs * x_dim1], + ldx, w); + bnrm *= scamin / work[i__ + kk * lds]; + xnrm[kk - 1] *= scamin / work[j + kk * lds]; + anrm = work[awrk + i__ + (j - 1) * nba]; + scaloc = dlarmm_(&anrm, &xnrm[kk - 1], &bnrm); + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to X( I, KK ) and X( J, KK ). */ + + scal = scamin / work[i__ + kk * lds] * scaloc; + if (scal != 1.) { + i__7 = i2 - i1; + zdscal_(&i__7, &scal, &x[i1 + rhs * x_dim1], &c__1); + work[i__ + kk * lds] = scamin * scaloc; + } + + scal = scamin / work[j + kk * lds] * scaloc; + if (scal != 1.) 
{ + i__7 = j2 - j1; + zdscal_(&i__7, &scal, &x[j1 + rhs * x_dim1], &c__1); + work[j + kk * lds] = scamin * scaloc; + } + } + + if (notran) { + +/* B( I, K ) := B( I, K ) - A( I, J ) * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + z__1.r = -1., z__1.i = 0.; + zgemm_("N", "N", &i__6, &i__7, &i__8, &z__1, &a[i1 + j1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b1, & + x[i1 + k1 * x_dim1], ldx); + } else if (lsame_(trans, "T")) { + +/* B( I, K ) := B( I, K ) - A( I, J )**T * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + z__1.r = -1., z__1.i = 0.; + zgemm_("T", "N", &i__6, &i__7, &i__8, &z__1, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b1, & + x[i1 + k1 * x_dim1], ldx); + } else { + +/* B( I, K ) := B( I, K ) - A( I, J )**H * X( J, K ) */ + + i__6 = i2 - i1; + i__7 = k2 - k1; + i__8 = j2 - j1; + z__1.r = -1., z__1.i = 0.; + zgemm_("C", "N", &i__6, &i__7, &i__8, &z__1, &a[j1 + i1 * + a_dim1], lda, &x[j1 + k1 * x_dim1], ldx, &c_b1, & + x[i1 + k1 * x_dim1], ldx); + } + } + } + +/* Reduce local scaling factors */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { +/* Computing MIN */ + d__1 = scale[rhs], d__2 = work[i__ + kk * lds]; + scale[rhs] = f2cmin(d__1,d__2); + } + } + +/* Realize consistent scaling */ + + i__3 = k2 - k1; + for (kk = 1; kk <= i__3; ++kk) { + rhs = k1 + kk - 1; + if (scale[rhs] != 1. && scale[rhs] != 0.) { + i__2 = nba; + for (i__ = 1; i__ <= i__2; ++i__) { + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__5 = i__ * nb; + i2 = f2cmin(i__5,*n) + 1; + scal = scale[rhs] / work[i__ + kk * lds]; + if (scal != 1.) { + i__5 = i2 - i1; + zdscal_(&i__5, &scal, &x[i1 + rhs * x_dim1], &c__1); + } + } + } + } + } + return 0; + +/* End of ZLATRS3 */ + +} /* zlatrs3_ */ + diff --git a/lapack-netlib/SRC/ztrsyl3.c b/lapack-netlib/SRC/ztrsyl3.c new file mode 100644 index 000000000..d05923a46 --- /dev/null +++ b/lapack-netlib/SRC/ztrsyl3.c @@ -0,0 +1,381 @@ +/* f2c.h -- Standard Fortran to C header file */ + +/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." 
+ + - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ + +#ifndef F2C_INCLUDE +#define F2C_INCLUDE + +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +typedef int integer; +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimag(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) ceil(w) +#define myhuge_(w) HUGE_VAL +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and 
-C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static _Complex double zpow_ui(_Complex double x, integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; + _Complex float zdotc = 0.0; + if (incx == 1 && incy == 1) { + for (i=0;i Date: Tue, 15 Nov 2022 16:26:44 +0100 Subject: [PATCH 096/154] Add f2c-converted files for the BLAS3-based Sylvester solver --- cmake/lapack.cmake | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index a78a89f1a..82511d41b 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -621,7 +621,8 @@ set(SLASRC ssyevd_2stage.c ssyev_2stage.c ssyevx_2stage.c ssyevr_2stage.c ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c sgesvdq.c slaorhr_col_getrfnp.c - slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c ) + slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c + slarmm.c slatrs3.c strsyl3.c) set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c @@ -718,7 +719,8 @@ set(CLASRC cheevd_2stage.c cheev_2stage.c cheevx_2stage.c cheevr_2stage.c chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c - cungtsqr.c cungtsqr_row.c cunhr_col.c ) + cungtsqr.c cungtsqr_row.c cunhr_col.c + clatrs3.c ctrsyl3.c) set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c @@ -809,7 +811,8 @@ set(DLASRC dsyevd_2stage.c dsyev_2stage.c dsyevx_2stage.c dsyevr_2stage.c dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c - 
dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c ) + dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c + dlarmm.c dlatrs3.c dtrsyl3.c) set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c @@ -910,7 +913,7 @@ set(ZLASRC zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c - zungtsqr.c zungtsqr_row.c zunhr_col.c) + zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c) set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c From 5dec93e93b38954154f3a8e12c905be101eddbe9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 15 Nov 2022 20:36:58 +0100 Subject: [PATCH 097/154] Complete the C conversion of the xTRSYL3 files --- lapack-netlib/SRC/ctrsyl3.c | 1518 ++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/dtrsyl3.c | 1556 ++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/strsyl3.c | 1561 +++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/ztrsyl3.c | 1519 ++++++++++++++++++++++++++++++++++ 4 files changed, 6154 insertions(+) diff --git a/lapack-netlib/SRC/ctrsyl3.c b/lapack-netlib/SRC/ctrsyl3.c index d05923a46..70f265a14 100644 --- a/lapack-netlib/SRC/ctrsyl3.c +++ b/lapack-netlib/SRC/ctrsyl3.c @@ -157,6 +157,7 @@ struct Namelist { }; typedef struct Namelist Namelist; +#define exponent(x) #define abs(x) ((x) >= 0 ? (x) : -(x)) #define dabs(x) (fabs(x)) #define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) @@ -233,7 +234,9 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define myhuge_(w) HUGE_VAL //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 @@ -379,3 +382,1518 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ pCd(z) = zdotc; } #endif +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static complex c_b1 = {1.f,0.f}; +static integer c__1 = 1; +static integer c_n1 = -1; +static real c_b18 = 2.f; +static real c_b106 = 1.f; + +/* > \brief \b CTRSYL3 */ + +/* Definition: */ +/* =========== */ + + +/* > \par Purpose */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > CTRSYL3 solves the complex Sylvester matrix equation: */ +/* > */ +/* > op(A)*X + X*op(B) = scale*C or */ +/* > op(A)*X - X*op(B) = scale*C, */ +/* > */ +/* > where op(A) = A or A**H, and A and B are both upper triangular. A is */ +/* > M-by-M and B is N-by-N; the right hand side C and the solution X are */ +/* > M-by-N; and scale is an output scale factor, set <= 1 to avoid */ +/* > overflow in X. */ +/* > */ +/* > This is the block version of the algorithm. 
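/* > The blocked algorithm solves each block X(K,L) with CTRSYL applied */
/* > to the diagonal blocks A(K,K) and B(L,L), performs the remaining */
/* > right-hand side updates with CGEMM, and keeps one local scale */
/* > factor per block of X in SWORK; these factors are reconciled into */
/* > the single output SCALE at the end. */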
*/ +/* > \endverbatim */ + +/* Arguments */ +/* ========= */ + +/* > \param[in] TRANA */ +/* > \verbatim */ +/* > TRANA is CHARACTER*1 */ +/* > Specifies the option op(A): */ +/* > = 'N': op(A) = A (No transpose) */ +/* > = 'C': op(A) = A**H (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANB */ +/* > \verbatim */ +/* > TRANB is CHARACTER*1 */ +/* > Specifies the option op(B): */ +/* > = 'N': op(B) = B (No transpose) */ +/* > = 'C': op(B) = B**H (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] ISGN */ +/* > \verbatim */ +/* > ISGN is INTEGER */ +/* > Specifies the sign in the equation: */ +/* > = +1: solve op(A)*X + X*op(B) = scale*C */ +/* > = -1: solve op(A)*X - X*op(B) = scale*C */ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The order of the matrix A, and the number of rows in the */ +/* > matrices X and C. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix B, and the number of columns in the */ +/* > matrices X and C. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX array, dimension (LDA,M) */ +/* > The upper triangular matrix A. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in] B */ +/* > \verbatim */ +/* > B is COMPLEX array, dimension (LDB,N) */ +/* > The upper triangular matrix B. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= f2cmax(1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] C */ +/* > \verbatim */ +/* > C is COMPLEX array, dimension (LDC,N) */ +/* > On entry, the M-by-N right hand side matrix C. */ +/* > On exit, C is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDC */ +/* > \verbatim */ +/* > LDC is INTEGER */ +/* > The leading dimension of the array C. LDC >= f2cmax(1,M) */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is REAL */ +/* > The scale factor, scale, set <= 1 to avoid overflow in X. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SWORK */ +/* > \verbatim */ +/* > SWORK is REAL array, dimension (MAX(2, ROWS), MAX(1,COLS)). */ +/* > On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS */ +/* > and SWORK(2) returns the optimal COLS. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDSWORK */ +/* > \verbatim */ +/* > LDSWORK is INTEGER */ +/* > LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) */ +/* > and NB is the optimal block size. */ +/* > */ +/* > If LDSWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the SWORK matrix, */ +/* > returns these values as the first and second entry of the SWORK */ +/* > matrix, and no error message related LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > = 1: A and B have common or very close eigenvalues; perturbed */ +/* > values were used to solve the equation (but the matrices */ +/* > A and B are unchanged). 
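/* A minimal calling sketch for the workspace query described above, */
/* assuming the f2c types (integer, real, complex) and the ctrsyl3_ */
/* prototype defined in this file plus <stdlib.h>; the helper name and */
/* the error handling are illustrative only, not part of the API. */

extern int ctrsyl3_(char *trana, char *tranb, integer *isgn, integer *m,
                    integer *n, complex *a, integer *lda, complex *b,
                    integer *ldb, complex *c__, integer *ldc, real *scale,
                    real *swork, integer *ldswork, integer *info);

static int solve_sylvester_sketch(integer m, integer n, complex *a,
                                  integer lda, complex *b, integer ldb,
                                  complex *c, integer ldc)
{
    char trana[] = "N", tranb[] = "N";
    integer isgn = 1, info = 0, ldswork = -1, rows, cols;
    real scale, query[2], *swork;

    /* Workspace query: LDSWORK = -1 only reports the optimal SWORK */
    /* dimensions in SWORK(1) and SWORK(2). */
    ctrsyl3_(trana, tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc,
             &scale, query, &ldswork, &info);
    rows = (integer) query[0];
    cols = (integer) query[1];
    if (rows < 2) rows = 2;
    if (cols < 1) cols = 1;

    swork = (real *) malloc((size_t) rows * (size_t) cols * sizeof(real));
    if (swork == NULL) return -1;
    ldswork = rows;

    /* Solve A*X + X*B = scale*C; X overwrites C and SCALE is set <= 1. */
    ctrsyl3_(trana, tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc,
             &scale, swork, &ldswork, &info);
    free(swork);
    return (int) info;
}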
*/ +/* > \endverbatim */ + +/* > \ingroup complexSYcomputational */ + +/* ===================================================================== */ +/* References: */ +/* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of */ +/* algorithms: The triangular Sylvester equation, ACM Transactions */ +/* on Mathematical Software (TOMS), volume 29, pages 218--243. */ + +/* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel */ +/* Solution of the Triangular Sylvester Equation. Lecture Notes in */ +/* Computer Science, vol 12043, pages 82--92, Springer. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. */ + +/* ===================================================================== */ +/* Subroutine */ int ctrsyl3_(char *trana, char *tranb, integer *isgn, + integer *m, integer *n, complex *a, integer *lda, complex *b, integer + *ldb, complex *c__, integer *ldc, real *scale, real *swork, integer * + ldswork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, swork_dim1, + swork_offset, i__1, i__2, i__3, i__4, i__5, i__6; + real r__1, r__2, r__3, r__4; + complex q__1; + + /* Local variables */ + real scal; + complex csgn; + real anrm, bnrm, cnrm; + integer awrk, bwrk; + real *wnrm, xnrm; + integer i__, j, k, l; + extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, + integer *, complex *, complex *, integer *, complex *, integer *, + complex *, complex *, integer *); + extern logical lsame_(char *, char *); + integer iinfo, i1, i2, j1, j2, k1, k2, l1, l2; +// extern integer myexp_(real *); + integer nb, jj, ll; + extern real clange_(char *, integer *, integer *, complex *, integer *, + real *); + extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, complex *, integer *, integer *); + real scaloc; + extern real slamch_(char *); + extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer + *); + real scamin; + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + real bignum; + extern real slarmm_(real *, real *, real *); + logical notrna, notrnb; + real smlnum; + extern /* Subroutine */ int ctrsyl_(char *, char *, integer *, integer *, + integer *, complex *, integer *, complex *, integer *, complex *, + integer *, real *, integer *); + logical lquery; + integer nba, nbb; + real buf, sgn; + + + +/* Decode and Test input parameters */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + swork_dim1 = *ldswork; + swork_offset = 1 + swork_dim1 * 1; + swork -= swork_offset; + + /* Function Body */ + notrna = lsame_(trana, "N"); + notrnb = lsame_(tranb, "N"); + +/* Use the same block size for all matrices. 
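/* The solver below partitions A into NBA x NBA and B into NBB x NBB */
/* blocks of size NB = max(8, ILAENV(1, 'CTRSYL', ...)). SWORK(K,L) with */
/* K <= NBA, L <= NBB holds the local scale factor of solution block */
/* X(K,L); the column groups starting after columns AWRK = NBB and */
/* BWRK = NBB + NBA cache norm upper bounds of the blocks of A and B, */
/* which is why the optimal workspace is ROWS = max(NBA,NBB) by */
/* COLS = 2*NBB + NBA. */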
*/ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "CTRSYL", "", m, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + +/* Compute number of blocks in A and B */ + +/* Computing MAX */ + i__1 = 1, i__2 = (*m + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nbb = f2cmax(i__1,i__2); + +/* Compute workspace */ + + *info = 0; + lquery = *ldswork == -1; + if (lquery) { + *ldswork = 2; + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + } + +/* Test the input arguments */ + + if (! notrna && ! lsame_(trana, "C")) { + *info = -1; + } else if (! notrnb && ! lsame_(tranb, "C")) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < f2cmax(1,*m)) { + *info = -7; + } else if (*ldb < f2cmax(1,*n)) { + *info = -9; + } else if (*ldc < f2cmax(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("CTRSYL3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + + *scale = 1.f; + if (*m == 0 || *n == 0) { + return 0; + } + + wnrm = (real*)malloc(f2cmax(*m,*n)*sizeof(real)); +/* Use unblocked code for small problems or if insufficient */ +/* workspace is provided */ + + if (f2cmin(nba,nbb) == 1 || *ldswork < f2cmax(nba,nbb)) { + ctrsyl_(trana, tranb, isgn, m, n, &a[a_offset], lda, &b[b_offset], + ldb, &c__[c_offset], ldc, scale, info); + return 0; + } + +/* Set constants to control overflow */ + + smlnum = slamch_("S"); + bignum = 1.f / smlnum; + +/* Set local scaling factors. */ + + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + i__2 = nba; + for (k = 1; k <= i__2; ++k) { + swork[k + l * swork_dim1] = 1.f; + } + } + +/* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. */ +/* This scaling is to ensure compatibility with TRSYL and may get flushed. */ + + buf = 1.f; + +/* Compute upper bounds of blocks of A and B */ + + awrk = nbb; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nba; + for (l = k; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*m) + 1; + if (notrna) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (awrk + l) * swork_dim1] = clange_("I", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (awrk + k) * swork_dim1] = clange_("1", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } + } + } + bwrk = nbb + nba; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*n) + 1; + i__2 = nbb; + for (l = k; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + if (notrnb) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (bwrk + l) * swork_dim1] = clange_("I", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (bwrk + k) * swork_dim1] = clange_("1", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } + } + } + + sgn = (real) (*isgn); + q__1.r = sgn, q__1.i = 0.f; + csgn.r = q__1.r, csgn.i = q__1.i; + + if (notrna && notrnb) { + +/* Solve A*X + ISGN*X*B = scale*C. 
*/ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-left corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* M L-1 */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. */ +/* I=K+1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__1 = k * nb; + k2 = f2cmin(i__1,*m) + 1; + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__2 = l * nb; + l2 = f2cmin(i__2,*n) + 1; + + i__2 = k2 - k1; + i__3 = l2 - l1; + ctrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = clange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + for (i__ = k - 1; i__ >= 1; --i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__2 = i__ * nb; + i2 = f2cmin(i__2,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = clange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
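/* BUF is a power-of-two buffer for the global scale: MYEXP_ returns the */
/* binary exponent e of SCALOC (via frexpf) and POW_RI(2,e) the matching */
/* power of two. Whenever SCALOC would flush a local scale factor to */
/* zero, that power of two is absorbed into BUF and the SWORK entries */
/* are rescaled (capped at BIGNUM) so that they stay representable; BUF */
/* is folded back into the returned SCALE at the very end of the routine. */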
*/ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + csscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + csscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("N", "N", &i__2, &i__3, &i__4, &q__1, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + + } + + i__2 = nbb; + for (j = l + 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__3 = j * nb; + j2 = f2cmin(i__3,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = clange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). 
*/ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + q__1.r = -csgn.r, q__1.i = -csgn.i; + cgemm_("N", "N", &i__3, &i__4, &i__5, &q__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (! notrna && notrnb) { + +/* Solve A**H *X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* upper-left corner column by column by */ + +/* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 L-1 */ +/* R(K,L) = SUM [A(I,K)**H*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] */ +/* I=1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + + i__3 = k2 - k1; + i__4 = l2 - l1; + ctrsyl_(trana, tranb, isgn, &i__3, &i__4, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__3); + } + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__3 = k2 - k1; + i__4 = l2 - l1; + xnrm = clange_("I", &i__3, &i__4, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__3 = nba; + for (i__ = k + 1; i__ <= i__3; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__4 = i__ * nb; + i2 = f2cmin(i__4,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. 
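/* Worked example of the consistent scaling: with SWORK(I,L) = 1 and */
/* SWORK(K,L) = 0.5 the common scale is SCAMIN = 0.5, so the block */
/* C(I1:I2-1, L1:L2-1) is multiplied by SCAMIN/SWORK(I,L) = 0.5 and */
/* C(K1:K2-1, L1:L2-1) by SCAMIN/SWORK(K,L) = 1, each times the robust */
/* factor SCALOC <= 1 from SLARMM that keeps the GEMM update below */
/* overflow; afterwards both blocks record the scale SCAMIN*SCALOC. */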
*/ + + i__4 = i2 - i1; + i__5 = l2 - l1; + cnrm = clange_("I", &i__4, &i__5, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__4 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + csscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = i2 - i1; + csscal_(&i__5, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__4 = i2 - i1; + i__5 = l2 - l1; + i__6 = k2 - k1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("C", "N", &i__4, &i__5, &i__6, &q__1, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + } + + i__3 = nbb; + for (j = l + 1; j <= i__3; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = k2 - k1; + i__5 = j2 - j1; + cnrm = clange_("I", &i__4, &i__5, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__4 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( K, J ) and C( K, L). 
*/ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + csscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = j2 - 1; + for (jj = j1; jj <= i__4; ++jj) { + i__5 = k2 - k1; + csscal_(&i__5, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__4 = k2 - k1; + i__5 = j2 - j1; + i__6 = l2 - l1; + q__1.r = -csgn.r, q__1.i = -csgn.i; + cgemm_("N", "N", &i__4, &i__5, &i__6, &q__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (! notrna && ! notrnb) { + +/* Solve A**H *X + ISGN*X*B**H = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* top-right corner column by column by */ + +/* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 N */ +/* R(K,L) = SUM [A(I,K)**H*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. */ +/* I=1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__2 = l * nb; + l2 = f2cmin(i__2,*n) + 1; + + i__2 = k2 - k1; + i__3 = l2 - l1; + ctrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = clange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__2 = nba; + for (i__ = k + 1; i__ <= i__2; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. 
*/ + + i__3 = i2 - i1; + i__4 = l2 - l1; + cnrm = clange_("I", &i__3, &i__4, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = i2 - i1; + csscal_(&i__4, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__3 = i2 - i1; + i__4 = l2 - l1; + i__5 = k2 - k1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("C", "N", &i__3, &i__4, &i__5, &q__1, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + } + + i__2 = l - 1; + for (j = 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__3 = j * nb; + j2 = f2cmin(i__3,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = clange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). 
*/ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + q__1.r = -csgn.r, q__1.i = -csgn.i; + cgemm_("N", "C", &i__3, &i__4, &i__5, &q__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (notrna && ! notrnb) { + +/* Solve A*X + ISGN*X*B**H = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-right corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) */ + +/* Where */ +/* M N */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. */ +/* I=K+1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__1 = k * nb; + k2 = f2cmin(i__1,*m) + 1; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__1 = l * nb; + l2 = f2cmin(i__1,*n) + 1; + + i__1 = k2 - k1; + i__2 = l2 - l1; + ctrsyl_(trana, tranb, isgn, &i__1, &i__2, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__1 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__1); + } + i__1 = nbb; + for (jj = 1; jj <= i__1; ++jj) { + i__2 = nba; + for (ll = 1; ll <= i__2; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__3 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b18, &i__3); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__1 = k2 - k1; + i__2 = l2 - l1; + xnrm = clange_("I", &i__1, &i__2, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__1 = k - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__2 = i__ * nb; + i2 = f2cmin(i__2,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. 
*/ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = clange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = k2 - k1; + csscal_(&i__3, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + csscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + q__1.r = -1.f, q__1.i = 0.f; + cgemm_("N", "N", &i__2, &i__3, &i__4, &q__1, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + + } + + i__1 = l - 1; + for (j = 1; j <= i__1; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = k2 - k1; + i__3 = j2 - j1; + cnrm = clange_("I", &i__2, &i__3, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). 
*/ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + csscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = j2 - 1; + for (jj = j1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + csscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__2 = k2 - k1; + i__3 = j2 - j1; + i__4 = l2 - l1; + q__1.r = -csgn.r, q__1.i = -csgn.i; + cgemm_("N", "C", &i__2, &i__3, &i__4, &q__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + + } + + free(wnrm); + +/* Reduce local scaling factors */ + + *scale = swork[swork_dim1 + 1]; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { +/* Computing MIN */ + r__1 = *scale, r__2 = swork[k + l * swork_dim1]; + *scale = f2cmin(r__1,r__2); + } + } + if (*scale == 0.f) { + +/* The magnitude of the largest entry of the solution is larger */ +/* than the product of BIGNUM**2 and cannot be represented in the */ +/* form (1/SCALE)*X if SCALE is REAL. Set SCALE to */ +/* zero and give up. */ + + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + return 0; + } + +/* Realize consistent scaling */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + scal = *scale / swork[k + l * swork_dim1]; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + csscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], &c__1); + } + } + } + } + + if (buf != 1.f && buf > 0.f) { + +/* Decrease SCALE as much as possible. */ + +/* Computing MIN */ + r__1 = *scale / smlnum, r__2 = 1.f / buf; + scaloc = f2cmin(r__1,r__2); + buf *= scaloc; + *scale /= scaloc; + } + + if (buf != 1.f && buf > 0.f) { + +/* In case of overly aggressive scaling during the computation, */ +/* flushing of the global scale factor may be prevented by */ +/* undoing some of the scaling. This step is to ensure that */ +/* this routine flushes only scale factors that TRSYL also */ +/* flushes and be usable as a drop-in replacement. */ + +/* How much can the normwise largest entry be upscaled? */ + +/* Computing MAX */ + i__1 = c_dim1 + 1; + r__3 = (r__1 = c__[i__1].r, abs(r__1)), r__4 = (r__2 = r_imag(&c__[ + c_dim1 + 1]), abs(r__2)); + scal = f2cmax(r__3,r__4); + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = *n; + for (l = 1; l <= i__2; ++l) { +/* Computing MAX */ + i__3 = k + l * c_dim1; + r__3 = scal, r__4 = (r__1 = c__[i__3].r, abs(r__1)), r__3 = + f2cmax(r__3,r__4), r__4 = (r__2 = r_imag(&c__[k + l * + c_dim1]), abs(r__2)); + scal = f2cmax(r__3,r__4); + } + } + +/* Increase BUF as close to 1 as possible and apply scaling. */ + +/* Computing MIN */ + r__1 = bignum / scal, r__2 = 1.f / buf; + scaloc = f2cmin(r__1,r__2); + buf *= scaloc; + clascl_("G", &c_n1, &c_n1, &c_b106, &scaloc, m, n, &c__[c_offset], + ldc, &iinfo); + } + +/* Combine with buffer scaling factor. SCALE will be flushed if */ +/* BUF is less than one here. 
*/ + + *scale *= buf; + +/* Restore workspace dimensions */ + + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + + return 0; + +/* End of CTRSYL3 */ + +} /* ctrsyl3_ */ + diff --git a/lapack-netlib/SRC/dtrsyl3.c b/lapack-netlib/SRC/dtrsyl3.c index d05923a46..199baab75 100644 --- a/lapack-netlib/SRC/dtrsyl3.c +++ b/lapack-netlib/SRC/dtrsyl3.c @@ -157,6 +157,7 @@ struct Namelist { }; typedef struct Namelist Namelist; +#define exponent(x) #define abs(x) ((x) >= 0 ? (x) : -(x)) #define dabs(x) (fabs(x)) #define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) @@ -233,7 +234,9 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define myhuge_(w) HUGE_VAL //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 @@ -379,3 +382,1556 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ pCd(z) = zdotc; } #endif +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static integer c__1 = 1; +static integer c_n1 = -1; +static doublereal c_b19 = 2.; +static doublereal c_b31 = -1.; +static doublereal c_b32 = 1.; + +/* > \brief \b DTRSYL3 */ + +/* Definition: */ +/* =========== */ + + +/* > \par Purpose */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > DTRSYL3 solves the real Sylvester matrix equation: */ +/* > */ +/* > op(A)*X + X*op(B) = scale*C or */ +/* > op(A)*X - X*op(B) = scale*C, */ +/* > */ +/* > where op(A) = A or A**T, and A and B are both upper quasi- */ +/* > triangular. A is M-by-M and B is N-by-N; the right hand side C and */ +/* > the solution X are M-by-N; and scale is an output scale factor, set */ +/* > <= 1 to avoid overflow in X. */ +/* > */ +/* > A and B must be in Schur canonical form (as returned by DHSEQR), that */ +/* > is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; */ +/* > each 2-by-2 diagonal block has its diagonal elements equal and its */ +/* > off-diagonal elements of opposite sign. */ +/* > */ +/* > This is the block version of the algorithm. */ +/* > \endverbatim */ + +/* Arguments */ +/* ========= */ + +/* > \param[in] TRANA */ +/* > \verbatim */ +/* > TRANA is CHARACTER*1 */ +/* > Specifies the option op(A): */ +/* > = 'N': op(A) = A (No transpose) */ +/* > = 'T': op(A) = A**T (Transpose) */ +/* > = 'C': op(A) = A**H (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANB */ +/* > \verbatim */ +/* > TRANB is CHARACTER*1 */ +/* > Specifies the option op(B): */ +/* > = 'N': op(B) = B (No transpose) */ +/* > = 'T': op(B) = B**T (Transpose) */ +/* > = 'C': op(B) = B**H (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] ISGN */ +/* > \verbatim */ +/* > ISGN is INTEGER */ +/* > Specifies the sign in the equation: */ +/* > = +1: solve op(A)*X + X*op(B) = scale*C */ +/* > = -1: solve op(A)*X - X*op(B) = scale*C */ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The order of the matrix A, and the number of rows in the */ +/* > matrices X and C. M >= 0. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix B, and the number of columns in the */ +/* > matrices X and C. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is DOUBLE PRECISION array, dimension (LDA,M) */ +/* > The upper quasi-triangular matrix A, in Schur canonical form. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in] B */ +/* > \verbatim */ +/* > B is DOUBLE PRECISION array, dimension (LDB,N) */ +/* > The upper quasi-triangular matrix B, in Schur canonical form. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= f2cmax(1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] C */ +/* > \verbatim */ +/* > C is DOUBLE PRECISION array, dimension (LDC,N) */ +/* > On entry, the M-by-N right hand side matrix C. */ +/* > On exit, C is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDC */ +/* > \verbatim */ +/* > LDC is INTEGER */ +/* > The leading dimension of the array C. LDC >= f2cmax(1,M) */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is DOUBLE PRECISION */ +/* > The scale factor, scale, set <= 1 to avoid overflow in X. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] IWORK */ +/* > \verbatim */ +/* > IWORK is INTEGER array, dimension (MAX(1,LIWORK)) */ +/* > On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LIWORK */ +/* > \verbatim */ +/* > IWORK is INTEGER */ +/* > The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) */ +/* > + ((N + NB - 1) / NB + 1), where NB is the optimal block size. */ +/* > */ +/* > If LIWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimension of the IWORK array, */ +/* > returns this value as the first entry of the IWORK array, and */ +/* > no error message related to LIWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SWORK */ +/* > \verbatim */ +/* > SWORK is DOUBLE PRECISION array, dimension (MAX(2, ROWS), */ +/* > MAX(1,COLS)). */ +/* > On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS */ +/* > and SWORK(2) returns the optimal COLS. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDSWORK */ +/* > \verbatim */ +/* > LDSWORK is INTEGER */ +/* > LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) */ +/* > and NB is the optimal block size. */ +/* > */ +/* > If LDSWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the SWORK matrix, */ +/* > returns these values as the first and second entry of the SWORK */ +/* > matrix, and no error message related LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > = 1: A and B have common or very close eigenvalues; perturbed */ +/* > values were used to solve the equation (but the matrices */ +/* > A and B are unchanged). 
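The IWORK and SWORK descriptions above follow the usual LAPACK workspace-query convention: call once with LIWORK = -1 or LDSWORK = -1 to obtain the required sizes, allocate, then call again to solve. A hedged sketch of that calling sequence; the typedefs, the solve_sylvester wrapper and the matrix arguments are illustrative assumptions, and the integer type must match the f2c.h of the actual build:

#include <stdio.h>
#include <stdlib.h>

typedef int integer;        // assumption: adjust to the f2c.h / blasint of the build
typedef double doublereal;

extern int dtrsyl3_(char *trana, char *tranb, integer *isgn,
                    integer *m, integer *n, doublereal *a, integer *lda,
                    doublereal *b, integer *ldb, doublereal *c, integer *ldc,
                    doublereal *scale, integer *iwork, integer *liwork,
                    doublereal *swork, integer *ldswork, integer *info);

void solve_sylvester(integer m, integer n, doublereal *a, doublereal *b, doublereal *c)
{
    integer isgn = 1, info = 0, lda = m, ldb = n, ldc = m;
    doublereal scale, query[2];
    integer iquery[1], liwork = -1, ldswork = -1;

    // 1) Workspace query: IWORK(1) reports LIWORK, SWORK(1:2) report ROWS and COLS.
    dtrsyl3_("N", "N", &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc,
             &scale, iquery, &liwork, query, &ldswork, &info);

    liwork  = iquery[0];
    ldswork = (integer)query[0];                 // ROWS
    integer cols = (integer)query[1];            // COLS
    integer *iwork = malloc((size_t)liwork * sizeof *iwork);
    doublereal *swork = malloc((size_t)ldswork * (size_t)cols * sizeof *swork);

    // 2) Actual solve: on exit C holds X and SCALE <= 1.
    dtrsyl3_("N", "N", &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc,
             &scale, iwork, &liwork, swork, &ldswork, &info);
    printf("info = %d, scale = %g\n", (int)info, scale);

    free(iwork);
    free(swork);
}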
*/ +/* > \endverbatim */ + +/* ===================================================================== */ +/* References: */ +/* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of */ +/* algorithms: The triangular Sylvester equation, ACM Transactions */ +/* on Mathematical Software (TOMS), volume 29, pages 218--243. */ + +/* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel */ +/* Solution of the Triangular Sylvester Equation. Lecture Notes in */ +/* Computer Science, vol 12043, pages 82--92, Springer. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. */ + +/* ===================================================================== */ +/* Subroutine */ int dtrsyl3_(char *trana, char *tranb, integer *isgn, + integer *m, integer *n, doublereal *a, integer *lda, doublereal *b, + integer *ldb, doublereal *c__, integer *ldc, doublereal *scale, + integer *iwork, integer *liwork, doublereal *swork, integer *ldswork, + integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, swork_dim1, + swork_offset, i__1, i__2, i__3, i__4, i__5, i__6; + doublereal d__1, d__2, d__3; + + /* Local variables */ + doublereal scal, anrm, bnrm, cnrm; + integer awrk, bwrk; + logical skip; + doublereal *wnrm, xnrm; + integer i__, j, k, l; + extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, + integer *), dgemm_(char *, char *, integer *, integer *, integer * + , doublereal *, doublereal *, integer *, doublereal *, integer *, + doublereal *, doublereal *, integer *); + extern logical lsame_(char *, char *); + integer iinfo, i1, i2, j1, j2, k1, k2, l1; +// extern integer myexp_(doublereal *); + integer l2, nb, pc, jj, ll; + extern doublereal dlamch_(char *), dlange_(char *, integer *, + integer *, doublereal *, integer *, doublereal *); + extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, integer *, doublereal *, + integer *, integer *); + doublereal scaloc, scamin; + extern doublereal dlarmm_(doublereal *, doublereal *, doublereal *); + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + doublereal bignum; + logical notrna, notrnb; + doublereal smlnum; + logical lquery; + extern /* Subroutine */ int dtrsyl_(char *, char *, integer *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, + doublereal *, integer *, doublereal *, integer *); + integer nba, nbb; + doublereal buf, sgn; + + +/* Decode and Test input parameters */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + --iwork; + swork_dim1 = *ldswork; + swork_offset = 1 + swork_dim1 * 1; + swork -= swork_offset; + + /* Function Body */ + notrna = lsame_(trana, "N"); + notrnb = lsame_(tranb, "N"); + +/* Use the same block size for all matrices. 
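The parameter adjustments above are the standard f2c idiom: subtracting 1 + LDA from the base pointer once lets the translated body keep Fortran's 1-based, column-major indexing, so a[i + j * a_dim1] addresses the Fortran element A(i,j). A small self-contained illustration of the idiom; fortran_at and the 2-by-2 matrix are made up for the example:

#include <stdio.h>

// Return the Fortran element A(i,j) of a column-major lda-by-n matrix,
// using the same base-pointer shift as the f2c-translated routines above.
static double fortran_at(const double *a0, int lda, int i, int j)
{
    int a_dim1 = lda;
    int a_offset = 1 + a_dim1;          // offset of A(1,1)
    const double *a = a0 - a_offset;    // shift base so 1-based indexing works
    return a[i + j * a_dim1];           // Fortran A(i,j)
}

int main(void)
{
    // 2-by-2 column-major matrix: columns stored contiguously
    double a0[4] = {1.0, 2.0,    // A(1,1), A(2,1)
                    3.0, 4.0};   // A(1,2), A(2,2)
    printf("A(2,1) = %g\n", fortran_at(a0, 2, 2, 1));   // prints 2
    printf("A(1,2) = %g\n", fortran_at(a0, 2, 1, 2));   // prints 3
    return 0;
}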
*/ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "DTRSYL", "", m, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + +/* Compute number of blocks in A and B */ + +/* Computing MAX */ + i__1 = 1, i__2 = (*m + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nbb = f2cmax(i__1,i__2); + +/* Compute workspace */ + + *info = 0; + lquery = *liwork == -1 || *ldswork == -1; + iwork[1] = nba + nbb + 2; + if (lquery) { + *ldswork = 2; + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + } + +/* Test the input arguments */ + + if (! notrna && ! lsame_(trana, "T") && ! lsame_( + trana, "C")) { + *info = -1; + } else if (! notrnb && ! lsame_(tranb, "T") && ! + lsame_(tranb, "C")) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < f2cmax(1,*m)) { + *info = -7; + } else if (*ldb < f2cmax(1,*n)) { + *info = -9; + } else if (*ldc < f2cmax(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("DTRSYL3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + + *scale = 1.; + if (*m == 0 || *n == 0) { + return 0; + } + + wnrm = (doublereal*)malloc(f2cmax(*m,*n)*sizeof(doublereal)); +/* Use unblocked code for small problems or if insufficient */ +/* workspaces are provided */ + + if (f2cmin(nba,nbb) == 1 || *ldswork < f2cmax(nba,nbb) || *liwork < iwork[1]) { + dtrsyl_(trana, tranb, isgn, m, n, &a[a_offset], lda, &b[b_offset], + ldb, &c__[c_offset], ldc, scale, info); + return 0; + } + +/* Set constants to control overflow */ + + smlnum = dlamch_("S"); + bignum = 1. / smlnum; + +/* Partition A such that 2-by-2 blocks on the diagonal are not split */ + + skip = FALSE_; + i__1 = nba; + for (i__ = 1; i__ <= i__1; ++i__) { + iwork[i__] = (i__ - 1) * nb + 1; + } + iwork[nba + 1] = *m + 1; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + l1 = iwork[k]; + l2 = iwork[k + 1] - 1; + i__2 = l2; + for (l = l1; l <= i__2; ++l) { + if (skip) { + skip = FALSE_; + mycycle_(); + } + if (l >= *m) { +/* A( M, M ) is a 1-by-1 block */ + mycycle_(); + } + if (a[l + (l + 1) * a_dim1] != 0. && a[l + 1 + l * a_dim1] != 0.) + { +/* Check if 2-by-2 block is split */ + if (l + 1 == iwork[k + 1]) { + ++iwork[k + 1]; + mycycle_(); + } + skip = TRUE_; + } + } + } + iwork[nba + 1] = *m + 1; + if (iwork[nba] >= iwork[nba + 1]) { + iwork[nba] = iwork[nba + 1]; + --nba; + } + +/* Partition B such that 2-by-2 blocks on the diagonal are not split */ + + pc = nba + 1; + skip = FALSE_; + i__1 = nbb; + for (i__ = 1; i__ <= i__1; ++i__) { + iwork[pc + i__] = (i__ - 1) * nb + 1; + } + iwork[pc + nbb + 1] = *n + 1; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + l1 = iwork[pc + k]; + l2 = iwork[pc + k + 1] - 1; + i__2 = l2; + for (l = l1; l <= i__2; ++l) { + if (skip) { + skip = FALSE_; + mycycle_(); + } + if (l >= *n) { +/* B( N, N ) is a 1-by-1 block */ + mycycle_(); + } + if (b[l + (l + 1) * b_dim1] != 0. && b[l + 1 + l * b_dim1] != 0.) + { +/* Check if 2-by-2 block is split */ + if (l + 1 == iwork[pc + k + 1]) { + ++iwork[pc + k + 1]; + mycycle_(); + } + skip = TRUE_; + } + } + } + iwork[pc + nbb + 1] = *n + 1; + if (iwork[pc + nbb] >= iwork[pc + nbb + 1]) { + iwork[pc + nbb] = iwork[pc + nbb + 1]; + --nbb; + } + +/* Set local scaling factors - must never attain zero. 
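The partition pass above enforces a single invariant: a 2-by-2 diagonal block of the quasi-triangular matrix must never be cut by a block boundary, so whenever a nonzero subdiagonal entry lands exactly on a boundary the boundary is pushed down by one row. A simplified standalone sketch of that adjustment; the partition helper and the subdiagonal pattern are invented for illustration and omit the end-of-matrix bookkeeping done above:

#include <stdio.h>

// Partition rows 1..m into blocks of nominal size nb, shifting any boundary
// that would split a 2-by-2 diagonal block (sub[l] != 0 means rows l and l+1
// belong to one 2-by-2 block; 1-based indexing as in the Fortran original).
static void partition(int m, int nb, const int *sub, int *start, int *nblocks)
{
    int nba = (m + nb - 1) / nb;
    if (nba < 1) nba = 1;
    for (int i = 1; i <= nba; ++i)
        start[i] = (i - 1) * nb + 1;
    start[nba + 1] = m + 1;

    for (int k = 1; k <= nba; ++k)
        for (int l = start[k]; l < start[k + 1] && l < m; ++l)
            if (sub[l] && l + 1 == start[k + 1])
                ++start[k + 1];          // keep the 2-by-2 block in one partition
    *nblocks = nba;
}

int main(void)
{
    // m = 8, nb = 4; a hypothetical 2-by-2 block occupies rows 4 and 5
    int sub[9] = {0, 0, 0, 0, 1, 0, 0, 0, 0};
    int start[4], nba;
    partition(8, 4, sub, start, &nba);
    for (int k = 1; k <= nba; ++k)
        printf("block %d: rows %d..%d\n", k, start[k], start[k + 1] - 1);
    return 0;
}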
*/ + + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + i__2 = nba; + for (k = 1; k <= i__2; ++k) { + swork[k + l * swork_dim1] = 1.; + } + } + +/* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. */ +/* This scaling is to ensure compatibility with TRSYL and may get flushed. */ + + buf = 1.; + +/* Compute upper bounds of blocks of A and B */ + + awrk = nbb; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nba; + for (l = k; l <= i__2; ++l) { + l1 = iwork[l]; + l2 = iwork[l + 1]; + if (notrna) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (awrk + l) * swork_dim1] = dlange_("I", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (awrk + k) * swork_dim1] = dlange_("1", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } + } + } + bwrk = nbb + nba; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[pc + k]; + k2 = iwork[pc + k + 1]; + i__2 = nbb; + for (l = k; l <= i__2; ++l) { + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + if (notrnb) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (bwrk + l) * swork_dim1] = dlange_("I", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (bwrk + k) * swork_dim1] = dlange_("1", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } + } + } + + sgn = (doublereal) (*isgn); + + if (notrna && notrnb) { + +/* Solve A*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-left corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* M L-1 */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. */ +/* I=K+1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__2 = k2 - k1; + i__3 = l2 - l1; + dtrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = dlange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + for (i__ = k - 1; i__ >= 1; --i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = dlange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + dscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + dscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + dgemm_("N", "N", &i__2, &i__3, &i__4, &c_b31, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + + } + + i__2 = nbb; + for (j = l + 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = dlange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
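A scalar picture may help here: before the GEMM update C(I,L) := C(I,L) - A(I,K) * C(K,L), the two C blocks can carry different local scales, so both are first brought to their common (smaller) scale SCAMIN, optionally reduced further by the DLARMM factor SCALOC, and only then is the subtraction performed on consistently scaled data. A toy sketch with scalars in place of blocks, taking SCALOC = 1 for brevity:

#include <stdio.h>

int main(void)
{
    // True values are stored_value / local_scale, as in SWORK(K,L).
    double x = 6.0, sx = 0.5;     // solution block C(K,L): true value 12
    double c = 5.0, sc = 1.0;     // block to update C(I,L): true value 5
    double a = 2.0;               // block of A, unscaled

    double scamin = sx < sc ? sx : sc;   // common scale of the pair
    x *= scamin / sx;                    // both values now live at scale scamin
    c *= scamin / sc;

    c -= a * x;                          // consistent update at scale scamin
    sx = sc = scamin;                    // record the shared scale

    // True updated value: 5 - 2 * 12 = -19, recovered as c / scamin.
    printf("stored %g at scale %g, true %g\n", c, scamin, c / scamin);
    return 0;
}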
*/ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + d__1 = -sgn; + dgemm_("N", "N", &i__3, &i__4, &i__5, &d__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (! notrna && notrnb) { + +/* Solve A**T*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* upper-left corner column by column by */ + +/* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 L-1 */ +/* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] */ +/* I=1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__3 = k2 - k1; + i__4 = l2 - l1; + dtrsyl_(trana, tranb, isgn, &i__3, &i__4, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__3); + } + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__3 = k2 - k1; + i__4 = l2 - l1; + xnrm = dlange_("I", &i__3, &i__4, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__3 = nba; + for (i__ = k + 1; i__ <= i__3; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = i2 - i1; + i__5 = l2 - l1; + cnrm = dlange_("I", &i__4, &i__5, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__4 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + dscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = i2 - i1; + dscal_(&i__5, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__4 = i2 - i1; + i__5 = l2 - l1; + i__6 = k2 - k1; + dgemm_("T", "N", &i__4, &i__5, &i__6, &c_b31, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + } + + i__3 = nbb; + for (j = l + 1; j <= i__3; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = k2 - k1; + i__5 = j2 - j1; + cnrm = dlange_("I", &i__4, &i__5, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__4 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + dscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = j2 - 1; + for (jj = j1; jj <= i__4; ++jj) { + i__5 = k2 - k1; + dscal_(&i__5, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__4 = k2 - k1; + i__5 = j2 - j1; + i__6 = l2 - l1; + d__1 = -sgn; + dgemm_("N", "N", &i__4, &i__5, &i__6, &d__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (! notrna && ! notrnb) { + +/* Solve A**T*X + ISGN*X*B**T = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* top-right corner column by column by */ + +/* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 N */ +/* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. */ +/* I=1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__2 = k2 - k1; + i__3 = l2 - l1; + dtrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = dlange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__2 = nba; + for (i__ = k + 1; i__ <= i__2; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = i2 - i1; + i__4 = l2 - l1; + cnrm = dlange_("I", &i__3, &i__4, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = i2 - i1; + dscal_(&i__4, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__3 = i2 - i1; + i__4 = l2 - l1; + i__5 = k2 - k1; + dgemm_("T", "N", &i__3, &i__4, &i__5, &c_b31, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + } + + i__2 = l - 1; + for (j = 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = dlange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + d__1 = -sgn; + dgemm_("N", "T", &i__3, &i__4, &i__5, &d__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (notrna && ! notrnb) { + +/* Solve A*X + ISGN*X*B**T = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-right corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) */ + +/* Where */ +/* M N */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. */ +/* I=K+1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__1 = k2 - k1; + i__2 = l2 - l1; + dtrsyl_(trana, tranb, isgn, &i__1, &i__2, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__1 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__1); + } + i__1 = nbb; + for (jj = 1; jj <= i__1; ++jj) { + i__2 = nba; + for (ll = 1; ll <= i__2; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__3 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b19, &i__3); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__1 = k2 - k1; + i__2 = l2 - l1; + xnrm = dlange_("I", &i__1, &i__2, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__1 = k - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = dlange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = k2 - k1; + dscal_(&i__3, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + dscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + dgemm_("N", "N", &i__2, &i__3, &i__4, &c_b31, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + + } + + i__1 = l - 1; + for (j = 1; j <= i__1; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = k2 - k1; + i__3 = j2 - j1; + cnrm = dlange_("I", &i__2, &i__3, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + dscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = j2 - 1; + for (jj = j1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + dscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__2 = k2 - k1; + i__3 = j2 - j1; + i__4 = l2 - l1; + d__1 = -sgn; + dgemm_("N", "T", &i__2, &i__3, &i__4, &d__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + + } + free(wnrm); +/* Reduce local scaling factors */ + + *scale = swork[swork_dim1 + 1]; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { +/* Computing MIN */ + d__1 = *scale, d__2 = swork[k + l * swork_dim1]; + *scale = f2cmin(d__1,d__2); + } + } + + if (*scale == 0.) { + +/* The magnitude of the largest entry of the solution is larger */ +/* than the product of BIGNUM**2 and cannot be represented in the */ +/* form (1/SCALE)*X if SCALE is DOUBLE PRECISION. Set SCALE to */ +/* zero and give up. */ + + iwork[1] = nba + nbb + 2; + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + return 0; + } + +/* Realize consistent scaling */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + scal = *scale / swork[k + l * swork_dim1]; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + dscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], &c__1); + } + } + } + } + + if (buf != 1. && buf > 0.) { + +/* Decrease SCALE as much as possible. */ + +/* Computing MIN */ + d__1 = *scale / smlnum, d__2 = 1. / buf; + scaloc = f2cmin(d__1,d__2); + buf *= scaloc; + *scale /= scaloc; + } + if (buf != 1. && buf > 0.) { + +/* In case of overly aggressive scaling during the computation, */ +/* flushing of the global scale factor may be prevented by */ +/* undoing some of the scaling. This step is to ensure that */ +/* this routine flushes only scale factors that TRSYL also */ +/* flushes and be usable as a drop-in replacement. */ + +/* How much can the normwise largest entry be upscaled? */ + + scal = c__[c_dim1 + 1]; + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = *n; + for (l = 1; l <= i__2; ++l) { +/* Computing MAX */ + d__2 = scal, d__3 = (d__1 = c__[k + l * c_dim1], abs(d__1)); + scal = f2cmax(d__2,d__3); + } + } + +/* Increase BUF as close to 1 as possible and apply scaling. */ + +/* Computing MIN */ + d__1 = bignum / scal, d__2 = 1. 
/ buf; + scaloc = f2cmin(d__1,d__2); + buf *= scaloc; + dlascl_("G", &c_n1, &c_n1, &c_b32, &scaloc, m, n, &c__[c_offset], ldc, + &iwork[1]); + } + +/* Combine with buffer scaling factor. SCALE will be flushed if */ +/* BUF is less than one here. */ + + *scale *= buf; + +/* Restore workspace dimensions */ + + iwork[1] = nba + nbb + 2; + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + + return 0; + +/* End of DTRSYL3 */ + +} /* dtrsyl3_ */ + diff --git a/lapack-netlib/SRC/strsyl3.c b/lapack-netlib/SRC/strsyl3.c index d05923a46..85d68e017 100644 --- a/lapack-netlib/SRC/strsyl3.c +++ b/lapack-netlib/SRC/strsyl3.c @@ -157,6 +157,7 @@ struct Namelist { }; typedef struct Namelist Namelist; +#define exponent(x) #define abs(x) ((x) >= 0 ? (x) : -(x)) #define dabs(x) (fabs(x)) #define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) @@ -233,7 +234,9 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define myhuge_(w) HUGE_VAL //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) +static int my_expfunc(float* x) {int e; (void)frexpf(*x,&e); return e;} /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 @@ -379,3 +382,1561 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ pCd(z) = zdotc; } #endif +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static integer c__1 = 1; +static integer c_n1 = -1; +static real c_b19 = 2.f; +static real c_b31 = -1.f; +static real c_b32 = 1.f; + +/* > \brief \b STRSYL3 */ + +/* Definition: */ +/* =========== */ + + +/* > \par Purpose */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > STRSYL3 solves the real Sylvester matrix equation: */ +/* > */ +/* > op(A)*X + X*op(B) = scale*C or */ +/* > op(A)*X - X*op(B) = scale*C, */ +/* > */ +/* > where op(A) = A or A**T, and A and B are both upper quasi- */ +/* > triangular. A is M-by-M and B is N-by-N; the right hand side C and */ +/* > the solution X are M-by-N; and scale is an output scale factor, set */ +/* > <= 1 to avoid overflow in X. */ +/* > */ +/* > A and B must be in Schur canonical form (as returned by SHSEQR), that */ +/* > is, block upper triangular with 1-by-1 and 2-by-2 diagonal blocks; */ +/* > each 2-by-2 diagonal block has its diagonal elements equal and its */ +/* > off-diagonal elements of opposite sign. */ +/* > */ +/* > This is the block version of the algorithm. 
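Both translated files add a small my_expfunc helper (frexp in the double-precision file, frexpf here), apparently standing in for Fortran's EXPONENT intrinsic: it returns the binary exponent e with x = m * 2^e and 0.5 <= |m| < 1, and the solvers multiply BUF and divide the SWORK entries by exact powers of two built from it. A minimal standalone check of that behaviour, with binary_exponent as an illustrative stand-in:

#include <math.h>
#include <stdio.h>

// Same idea as the my_expfunc helper added above: the binary exponent of x,
// i.e. the e with x = m * pow(2, e) and 0.5 <= fabs(m) < 1.
static int binary_exponent(double x)
{
    int e;
    (void)frexp(x, &e);
    return e;
}

int main(void)
{
    double scaloc = 0.125;                    // a hypothetical TRSYL scale factor
    int e = binary_exponent(scaloc);          // 0.125 = 0.5 * 2^-2, so e = -2
    double buf = 1.0;
    buf *= pow(2.0, e);                       // exact power-of-two bookkeeping
    printf("exponent(%g) = %d, buf = %g\n", scaloc, e, buf);
    printf("exponent(12.0) = %d\n", binary_exponent(12.0));   // 12 = 0.75 * 2^4
    return 0;
}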
*/ +/* > \endverbatim */ + +/* Arguments */ +/* ========= */ + +/* > \param[in] TRANA */ +/* > \verbatim */ +/* > TRANA is CHARACTER*1 */ +/* > Specifies the option op(A): */ +/* > = 'N': op(A) = A (No transpose) */ +/* > = 'T': op(A) = A**T (Transpose) */ +/* > = 'C': op(A) = A**H (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANB */ +/* > \verbatim */ +/* > TRANB is CHARACTER*1 */ +/* > Specifies the option op(B): */ +/* > = 'N': op(B) = B (No transpose) */ +/* > = 'T': op(B) = B**T (Transpose) */ +/* > = 'C': op(B) = B**H (Conjugate transpose = Transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] ISGN */ +/* > \verbatim */ +/* > ISGN is INTEGER */ +/* > Specifies the sign in the equation: */ +/* > = +1: solve op(A)*X + X*op(B) = scale*C */ +/* > = -1: solve op(A)*X - X*op(B) = scale*C */ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The order of the matrix A, and the number of rows in the */ +/* > matrices X and C. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix B, and the number of columns in the */ +/* > matrices X and C. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is REAL array, dimension (LDA,M) */ +/* > The upper quasi-triangular matrix A, in Schur canonical form. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in] B */ +/* > \verbatim */ +/* > B is REAL array, dimension (LDB,N) */ +/* > The upper quasi-triangular matrix B, in Schur canonical form. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= f2cmax(1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] C */ +/* > \verbatim */ +/* > C is REAL array, dimension (LDC,N) */ +/* > On entry, the M-by-N right hand side matrix C. */ +/* > On exit, C is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDC */ +/* > \verbatim */ +/* > LDC is INTEGER */ +/* > The leading dimension of the array C. LDC >= f2cmax(1,M) */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is REAL */ +/* > The scale factor, scale, set <= 1 to avoid overflow in X. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] IWORK */ +/* > \verbatim */ +/* > IWORK is INTEGER array, dimension (MAX(1,LIWORK)) */ +/* > On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LIWORK */ +/* > \verbatim */ +/* > IWORK is INTEGER */ +/* > The dimension of the array IWORK. LIWORK >= ((M + NB - 1) / NB + 1) */ +/* > + ((N + NB - 1) / NB + 1), where NB is the optimal block size. */ +/* > */ +/* > If LIWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimension of the IWORK array, */ +/* > returns this value as the first entry of the IWORK array, and */ +/* > no error message related to LIWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SWORK */ +/* > \verbatim */ +/* > SWORK is REAL array, dimension (MAX(2, ROWS), */ +/* > MAX(1,COLS)). */ +/* > On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS */ +/* > and SWORK(2) returns the optimal COLS. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDSWORK */ +/* > \verbatim */ +/* > LDSWORK is INTEGER */ +/* > LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) */ +/* > and NB is the optimal block size. */ +/* > */ +/* > If LDSWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the SWORK matrix, */ +/* > returns these values as the first and second entry of the SWORK */ +/* > matrix, and no error message related LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > = 1: A and B have common or very close eigenvalues; perturbed */ +/* > values were used to solve the equation (but the matrices */ +/* > A and B are unchanged). */ +/* > \endverbatim */ + +/* ===================================================================== */ +/* References: */ +/* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of */ +/* algorithms: The triangular Sylvester equation, ACM Transactions */ +/* on Mathematical Software (TOMS), volume 29, pages 218--243. */ + +/* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel */ +/* Solution of the Triangular Sylvester Equation. Lecture Notes in */ +/* Computer Science, vol 12043, pages 82--92, Springer. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. */ + +/* ===================================================================== */ +/* Subroutine */ int strsyl3_(char *trana, char *tranb, integer *isgn, + integer *m, integer *n, real *a, integer *lda, real *b, integer *ldb, + real *c__, integer *ldc, real *scale, integer *iwork, integer *liwork, + real *swork, integer *ldswork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, swork_dim1, + swork_offset, i__1, i__2, i__3, i__4, i__5, i__6; + real r__1, r__2, r__3; + + /* Local variables */ + real scal, anrm, bnrm, cnrm; + integer awrk, bwrk; + logical skip; + real *wnrm, xnrm; + integer i__, j, k, l; + extern logical lsame_(char *, char *); + integer iinfo; + extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + sgemm_(char *, char *, integer *, integer *, integer *, real *, + real *, integer *, real *, integer *, real *, real *, integer *); + integer i1, i2, j1, j2, k1, k2, l1; +// extern integer myexp_(real *); + integer l2, nb, pc, jj, ll; + real scaloc; + extern real slamch_(char *), slange_(char *, integer *, integer *, + real *, integer *, real *); + real scamin; + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + real bignum; + extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, real *, integer *, integer *); + extern real slarmm_(real *, real *, real *); + logical notrna, notrnb; + real smlnum; + logical lquery; + extern /* Subroutine */ int strsyl_(char *, char *, integer *, integer *, + integer *, real *, integer *, real *, integer *, real *, integer * + , real *, integer *); + integer nba, nbb; + real buf, sgn; + +/* Decode and Test input parameters */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + 
c_dim1 * 1; + c__ -= c_offset; + --iwork; + swork_dim1 = *ldswork; + swork_offset = 1 + swork_dim1 * 1; + swork -= swork_offset; + + /* Function Body */ + notrna = lsame_(trana, "N"); + notrnb = lsame_(tranb, "N"); + +/* Use the same block size for all matrices. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "STRSYL", "", m, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + +/* Compute number of blocks in A and B */ + +/* Computing MAX */ + i__1 = 1, i__2 = (*m + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nbb = f2cmax(i__1,i__2); + +/* Compute workspace */ + + *info = 0; + lquery = *liwork == -1 || *ldswork == -1; + iwork[1] = nba + nbb + 2; + if (lquery) { + *ldswork = 2; + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + } + +/* Test the input arguments */ + + if (! notrna && ! lsame_(trana, "T") && ! lsame_( + trana, "C")) { + *info = -1; + } else if (! notrnb && ! lsame_(tranb, "T") && ! + lsame_(tranb, "C")) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < f2cmax(1,*m)) { + *info = -7; + } else if (*ldb < f2cmax(1,*n)) { + *info = -9; + } else if (*ldc < f2cmax(1,*m)) { + *info = -11; + } else if (! lquery && *liwork < iwork[1]) { + *info = -14; + } else if (! lquery && *ldswork < f2cmax(nba,nbb)) { + *info = -16; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("STRSYL3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + + *scale = 1.f; + if (*m == 0 || *n == 0) { + return 0; + } + +/* Use unblocked code for small problems or if insufficient */ +/* workspaces are provided */ + + if (f2cmin(nba,nbb) == 1 || *ldswork < f2cmax(nba,nbb) || *liwork < iwork[1]) { + strsyl_(trana, tranb, isgn, m, n, &a[a_offset], lda, &b[b_offset], + ldb, &c__[c_offset], ldc, scale, info); + return 0; + } + + +/* REAL WNRM( MAX( M, N ) ) */ + wnrm=(real*)malloc (f2cmax(*m,*n)*sizeof(real)); + +/* Set constants to control overflow */ + + smlnum = slamch_("S"); + bignum = 1.f / smlnum; + +/* Partition A such that 2-by-2 blocks on the diagonal are not split */ + + skip = FALSE_; + i__1 = nba; + for (i__ = 1; i__ <= i__1; ++i__) { + iwork[i__] = (i__ - 1) * nb + 1; + } + iwork[nba + 1] = *m + 1; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + l1 = iwork[k]; + l2 = iwork[k + 1] - 1; + i__2 = l2; + for (l = l1; l <= i__2; ++l) { + if (skip) { + skip = FALSE_; + mycycle_(); + } + if (l >= *m) { +/* A( M, M ) is a 1-by-1 block */ + mycycle_(); + } + if (a[l + (l + 1) * a_dim1] != 0.f && a[l + 1 + l * a_dim1] != + 0.f) { +/* Check if 2-by-2 block is split */ + if (l + 1 == iwork[k + 1]) { + ++iwork[k + 1]; + mycycle_(); + } + skip = TRUE_; + } + } + } + iwork[nba + 1] = *m + 1; + if (iwork[nba] >= iwork[nba + 1]) { + iwork[nba] = iwork[nba + 1]; + --nba; + } + +/* Partition B such that 2-by-2 blocks on the diagonal are not split */ + + pc = nba + 1; + skip = FALSE_; + i__1 = nbb; + for (i__ = 1; i__ <= i__1; ++i__) { + iwork[pc + i__] = (i__ - 1) * nb + 1; + } + iwork[pc + nbb + 1] = *n + 1; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + l1 = iwork[pc + k]; + l2 = iwork[pc + k + 1] - 1; + i__2 = l2; + for (l = l1; l <= i__2; ++l) { + if (skip) { + skip = FALSE_; + mycycle_(); + } + if (l >= *n) { +/* B( N, N ) is a 1-by-1 block */ + mycycle_(); + } + if (b[l + (l + 1) * b_dim1] != 0.f && b[l + 1 + l * 
b_dim1] != + 0.f) { +/* Check if 2-by-2 block is split */ + if (l + 1 == iwork[pc + k + 1]) { + ++iwork[pc + k + 1]; + mycycle_(); + } + skip = TRUE_; + } + } + } + iwork[pc + nbb + 1] = *n + 1; + if (iwork[pc + nbb] >= iwork[pc + nbb + 1]) { + iwork[pc + nbb] = iwork[pc + nbb + 1]; + --nbb; + } + +/* Set local scaling factors - must never attain zero. */ + + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + i__2 = nba; + for (k = 1; k <= i__2; ++k) { + swork[k + l * swork_dim1] = 1.f; + } + } + +/* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. */ +/* This scaling is to ensure compatibility with TRSYL and may get flushed. */ + + buf = 1.f; + +/* Compute upper bounds of blocks of A and B */ + + awrk = nbb; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nba; + for (l = k; l <= i__2; ++l) { + l1 = iwork[l]; + l2 = iwork[l + 1]; + if (notrna) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (awrk + l) * swork_dim1] = slange_("I", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (awrk + k) * swork_dim1] = slange_("1", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } + } + } + bwrk = nbb + nba; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[pc + k]; + k2 = iwork[pc + k + 1]; + i__2 = nbb; + for (l = k; l <= i__2; ++l) { + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + if (notrnb) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (bwrk + l) * swork_dim1] = slange_("I", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (bwrk + k) * swork_dim1] = slange_("1", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } + } + } + + sgn = (real) (*isgn); + + if (notrna && notrnb) { + +/* Solve A*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-left corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* M L-1 */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. */ +/* I=K+1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__2 = k2 - k1; + i__3 = l2 - l1; + strsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. 
The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = slange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + for (i__ = k - 1; i__ >= 1; --i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = slange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + sscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + sscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + sgemm_("N", "N", &i__2, &i__3, &i__4, &c_b31, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + + } + + i__2 = nbb; + for (j = l + 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = slange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + r__1 = -sgn; + sgemm_("N", "N", &i__3, &i__4, &i__5, &r__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (! notrna && notrnb) { + +/* Solve A**T*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* upper-left corner column by column by */ + +/* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 L-1 */ +/* R(K,L) = SUM [A(I,K)**T*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] */ +/* I=1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__3 = k2 - k1; + i__4 = l2 - l1; + strsyl_(trana, tranb, isgn, &i__3, &i__4, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__3); + } + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__3 = k2 - k1; + i__4 = l2 - l1; + xnrm = slange_("I", &i__3, &i__4, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__3 = nba; + for (i__ = k + 1; i__ <= i__3; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = i2 - i1; + i__5 = l2 - l1; + cnrm = slange_("I", &i__4, &i__5, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__4 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + sscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = i2 - i1; + sscal_(&i__5, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__4 = i2 - i1; + i__5 = l2 - l1; + i__6 = k2 - k1; + sgemm_("T", "N", &i__4, &i__5, &i__6, &c_b31, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + } + + i__3 = nbb; + for (j = l + 1; j <= i__3; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = k2 - k1; + i__5 = j2 - j1; + cnrm = slange_("I", &i__4, &i__5, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__4 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + sscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__4 = j2 - 1; + for (jj = j1; jj <= i__4; ++jj) { + i__5 = k2 - k1; + sscal_(&i__5, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__4 = k2 - k1; + i__5 = j2 - j1; + i__6 = l2 - l1; + r__1 = -sgn; + sgemm_("N", "N", &i__4, &i__5, &i__6, &r__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (! notrna && ! notrnb) { + +/* Solve A**T*X + ISGN*X*B**T = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* top-right corner column by column by */ + +/* A(K,K)**T*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 N */ +/* R(K,L) = SUM [A(I,K)**T*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. */ +/* I=1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__2 = k2 - k1; + i__3 = l2 - l1; + strsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = slange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__2 = nba; + for (i__ = k + 1; i__ <= i__2; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**T * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = i2 - i1; + i__4 = l2 - l1; + cnrm = slange_("I", &i__3, &i__4, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = i2 - i1; + sscal_(&i__4, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__3 = i2 - i1; + i__4 = l2 - l1; + i__5 = k2 - k1; + sgemm_("T", "N", &i__3, &i__4, &i__5, &c_b31, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + } + + i__2 = l - 1; + for (j = 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = slange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__3 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + r__1 = -sgn; + sgemm_("N", "T", &i__3, &i__4, &i__5, &r__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + } else if (notrna && ! notrnb) { + +/* Solve A*X + ISGN*X*B**T = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-right corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**T = C(K,L) - R(K,L) */ + +/* Where */ +/* M N */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**T]. */ +/* I=K+1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = iwork[k]; + k2 = iwork[k + 1]; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + + i__1 = k2 - k1; + i__2 = l2 - l1; + strsyl_(trana, tranb, isgn, &i__1, &i__2, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.f) { + if (scaloc == 0.f) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.f; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__1 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__1); + } + i__1 = nbb; + for (jj = 1; jj <= i__1; ++jj) { + i__2 = nba; + for (ll = 1; ll <= i__2; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. 
*/ +/* Computing MIN */ + i__3 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * swork_dim1] + / pow_ri(&c_b19, &i__3); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__1 = k2 - k1; + i__2 = l2 - l1; + xnrm = slange_("I", &i__1, &i__2, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__1 = k - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = iwork[i__]; + i2 = iwork[i__ + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = slange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[i__ + l * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = slarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = k2 - k1; + sscal_(&i__3, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + sscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + sgemm_("N", "N", &i__2, &i__3, &i__4, &c_b31, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, & + c_b32, &c__[i1 + l1 * c_dim1], ldc); + + } + + i__1 = l - 1; + for (j = 1; j <= i__1; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**T */ + + j1 = iwork[pc + j]; + j2 = iwork[pc + j + 1]; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = k2 - k1; + i__3 = j2 - j1; + cnrm = slange_("I", &i__2, &i__3, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + r__1 = swork[k + j * swork_dim1], r__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(r__1,r__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = slarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.f) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__2 = myexp_(&scaloc); + buf *= pow_ri(&c_b19, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + r__1 = bignum, r__2 = swork[ll + jj * + swork_dim1] / pow_ri(&c_b19, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(r__1,r__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_ri(&c_b19, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_ri(&c_b19, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + sscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.f) { + i__2 = j2 - 1; + for (jj = j1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + sscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__2 = k2 - k1; + i__3 = j2 - j1; + i__4 = l2 - l1; + r__1 = -sgn; + sgemm_("N", "T", &i__2, &i__3, &i__4, &r__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b32, + &c__[k1 + j1 * c_dim1], ldc); + } + } + } + + } + + free(wnrm); +/* Reduce local scaling factors */ + + *scale = swork[swork_dim1 + 1]; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { +/* Computing MIN */ + r__1 = *scale, r__2 = swork[k + l * swork_dim1]; + *scale = f2cmin(r__1,r__2); + } + } + + if (*scale == 0.f) { + +/* The magnitude of the largest entry of the solution is larger */ +/* than the product of BIGNUM**2 and cannot be represented in the */ +/* form (1/SCALE)*X if SCALE is REAL. Set SCALE to zero and give up. */ + + iwork[1] = nba + nbb + 2; + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + return 0; + } + +/* Realize consistent scaling */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = iwork[k]; + k2 = iwork[k + 1]; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + l1 = iwork[pc + l]; + l2 = iwork[pc + l + 1]; + scal = *scale / swork[k + l * swork_dim1]; + if (scal != 1.f) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + sscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], &c__1); + } + } + } + } + + if (buf != 1.f && buf > 0.f) { + +/* Decrease SCALE as much as possible. */ + +/* Computing MIN */ + r__1 = *scale / smlnum, r__2 = 1.f / buf; + scaloc = f2cmin(r__1,r__2); + buf *= scaloc; + *scale /= scaloc; + } + if (buf != 1.f && buf > 0.f) { + +/* In case of overly aggressive scaling during the computation, */ +/* flushing of the global scale factor may be prevented by */ +/* undoing some of the scaling. This step is to ensure that */ +/* this routine flushes only scale factors that TRSYL also */ +/* flushes and be usable as a drop-in replacement. */ + +/* How much can the normwise largest entry be upscaled? */ + + scal = c__[c_dim1 + 1]; + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = *n; + for (l = 1; l <= i__2; ++l) { +/* Computing MAX */ + r__2 = scal, r__3 = (r__1 = c__[k + l * c_dim1], abs(r__1)); + scal = f2cmax(r__2,r__3); + } + } + +/* Increase BUF as close to 1 as possible and apply scaling. 
*/ + +/* Computing MIN */ + r__1 = bignum / scal, r__2 = 1.f / buf; + scaloc = f2cmin(r__1,r__2); + buf *= scaloc; + slascl_("G", &c_n1, &c_n1, &c_b32, &scaloc, m, n, &c__[c_offset], ldc, + &iwork[1]); + } + +/* Combine with buffer scaling factor. SCALE will be flushed if */ +/* BUF is less than one here. */ + + *scale *= buf; + +/* Restore workspace dimensions */ + + iwork[1] = nba + nbb + 2; + swork[swork_dim1 + 1] = (real) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (real) ((nbb << 1) + nba); + + return 0; + +/* End of STRSYL3 */ + +} /* strsyl3_ */ + diff --git a/lapack-netlib/SRC/ztrsyl3.c b/lapack-netlib/SRC/ztrsyl3.c index d05923a46..c1be7d589 100644 --- a/lapack-netlib/SRC/ztrsyl3.c +++ b/lapack-netlib/SRC/ztrsyl3.c @@ -157,6 +157,7 @@ struct Namelist { }; typedef struct Namelist Namelist; +#define exponent(x) #define abs(x) ((x) >= 0 ? (x) : -(x)) #define dabs(x) (fabs(x)) #define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) @@ -233,7 +234,9 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define myhuge_(w) HUGE_VAL //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 @@ -379,3 +382,1519 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ pCd(z) = zdotc; } #endif +/* -- translated by f2c (version 20000121). + You must link the resulting object file with the libraries: + -lf2c -lm (in that order) +*/ + + + +/* Table of constant values */ + +static doublecomplex c_b1 = {1.,0.}; +static integer c__1 = 1; +static integer c_n1 = -1; +static doublereal c_b18 = 2.; +static doublereal c_b106 = 1.; + +/* > \brief \b ZTRSYL3 */ + +/* Definition: */ +/* =========== */ + + +/* > \par Purpose */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > ZTRSYL3 solves the complex Sylvester matrix equation: */ +/* > */ +/* > op(A)*X + X*op(B) = scale*C or */ +/* > op(A)*X - X*op(B) = scale*C, */ +/* > */ +/* > where op(A) = A or A**H, and A and B are both upper triangular. A is */ +/* > M-by-M and B is N-by-N; the right hand side C and the solution X are */ +/* > M-by-N; and scale is an output scale factor, set <= 1 to avoid */ +/* > overflow in X. */ +/* > */ +/* > This is the block version of the algorithm. */ +/* > \endverbatim */ + +/* Arguments */ +/* ========= */ + +/* > \param[in] TRANA */ +/* > \verbatim */ +/* > TRANA is CHARACTER*1 */ +/* > Specifies the option op(A): */ +/* > = 'N': op(A) = A (No transpose) */ +/* > = 'C': op(A) = A**H (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] TRANB */ +/* > \verbatim */ +/* > TRANB is CHARACTER*1 */ +/* > Specifies the option op(B): */ +/* > = 'N': op(B) = B (No transpose) */ +/* > = 'C': op(B) = B**H (Conjugate transpose) */ +/* > \endverbatim */ +/* > */ +/* > \param[in] ISGN */ +/* > \verbatim */ +/* > ISGN is INTEGER */ +/* > Specifies the sign in the equation: */ +/* > = +1: solve op(A)*X + X*op(B) = scale*C */ +/* > = -1: solve op(A)*X - X*op(B) = scale*C */ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The order of the matrix A, and the number of rows in the */ +/* > matrices X and C. M >= 0. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The order of the matrix B, and the number of columns in the */ +/* > matrices X and C. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] A */ +/* > \verbatim */ +/* > A is COMPLEX*16 array, dimension (LDA,M) */ +/* > The upper triangular matrix A. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in] B */ +/* > \verbatim */ +/* > B is COMPLEX*16 array, dimension (LDB,N) */ +/* > The upper triangular matrix B. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= f2cmax(1,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] C */ +/* > \verbatim */ +/* > C is COMPLEX*16 array, dimension (LDC,N) */ +/* > On entry, the M-by-N right hand side matrix C. */ +/* > On exit, C is overwritten by the solution matrix X. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDC */ +/* > \verbatim */ +/* > LDC is INTEGER */ +/* > The leading dimension of the array C. LDC >= f2cmax(1,M) */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SCALE */ +/* > \verbatim */ +/* > SCALE is DOUBLE PRECISION */ +/* > The scale factor, scale, set <= 1 to avoid overflow in X. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] SWORK */ +/* > \verbatim */ +/* > SWORK is DOUBLE PRECISION array, dimension (MAX(2, ROWS), */ +/* > MAX(1,COLS)). */ +/* > On exit, if INFO = 0, SWORK(1) returns the optimal value ROWS */ +/* > and SWORK(2) returns the optimal COLS. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDSWORK */ +/* > \verbatim */ +/* > LDSWORK is INTEGER */ +/* > LDSWORK >= MAX(2,ROWS), where ROWS = ((M + NB - 1) / NB + 1) */ +/* > and NB is the optimal block size. */ +/* > */ +/* > If LDSWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal dimensions of the SWORK matrix, */ +/* > returns these values as the first and second entry of the SWORK */ +/* > matrix, and no error message related LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > = 1: A and B have common or very close eigenvalues; perturbed */ +/* > values were used to solve the equation (but the matrices */ +/* > A and B are unchanged). */ +/* > \endverbatim */ + +/* > \ingroup complex16SYcomputational */ + +/* ===================================================================== */ +/* References: */ +/* E. S. Quintana-Orti and R. A. Van De Geijn (2003). Formal derivation of */ +/* algorithms: The triangular Sylvester equation, ACM Transactions */ +/* on Mathematical Software (TOMS), volume 29, pages 218--243. */ + +/* A. Schwarz and C. C. Kjelgaard Mikkelsen (2020). Robust Task-Parallel */ +/* Solution of the Triangular Sylvester Equation. Lecture Notes in */ +/* Computer Science, vol 12043, pages 82--92, Springer. */ + +/* Contributor: */ +/* Angelika Schwarz, Umea University, Sweden. 
*/ + +/* ===================================================================== */ +/* Subroutine */ int ztrsyl3_(char *trana, char *tranb, integer *isgn, + integer *m, integer *n, doublecomplex *a, integer *lda, doublecomplex + *b, integer *ldb, doublecomplex *c__, integer *ldc, doublereal *scale, + doublereal *swork, integer *ldswork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, swork_dim1, + swork_offset, i__1, i__2, i__3, i__4, i__5, i__6; + doublereal d__1, d__2, d__3, d__4; + doublecomplex z__1; + + /* Local variables */ + doublereal scal; + doublecomplex csgn; + doublereal anrm, bnrm, cnrm; + integer awrk, bwrk; + doublereal *wnrm, xnrm; + integer i__, j, k, l; + extern logical lsame_(char *, char *); + integer iinfo; + extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, + integer *, doublecomplex *, doublecomplex *, integer *, + doublecomplex *, integer *, doublecomplex *, doublecomplex *, + integer *); + integer i1, i2, j1, j2, k1, k2, l1, l2; +// extern integer myexp_(doublereal *); + integer nb, jj, ll; + extern doublereal dlamch_(char *); + doublereal scaloc, scamin; + extern doublereal dlarmm_(doublereal *, doublereal *, doublereal *); + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, + integer *, doublereal *); + doublereal bignum; + extern /* Subroutine */ int zdscal_(integer *, doublereal *, + doublecomplex *, integer *), zlascl_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, integer *, doublecomplex * + , integer *, integer *); + logical notrna, notrnb; + doublereal smlnum; + logical lquery; + extern /* Subroutine */ int ztrsyl_(char *, char *, integer *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *, doublereal *, integer *); + integer nba, nbb; + doublereal buf, sgn; + + + +/* Decode and Test input parameters */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + c_dim1 = *ldc; + c_offset = 1 + c_dim1 * 1; + c__ -= c_offset; + swork_dim1 = *ldswork; + swork_offset = 1 + swork_dim1 * 1; + swork -= swork_offset; + + /* Function Body */ + notrna = lsame_(trana, "N"); + notrnb = lsame_(tranb, "N"); + +/* Use the same block size for all matrices. */ + +/* Computing MAX */ + i__1 = 8, i__2 = ilaenv_(&c__1, "ZTRSYL", "", m, n, &c_n1, &c_n1, (ftnlen) + 6, (ftnlen)0); + nb = f2cmax(i__1,i__2); + +/* Compute number of blocks in A and B */ + +/* Computing MAX */ + i__1 = 1, i__2 = (*m + nb - 1) / nb; + nba = f2cmax(i__1,i__2); +/* Computing MAX */ + i__1 = 1, i__2 = (*n + nb - 1) / nb; + nbb = f2cmax(i__1,i__2); + +/* Compute workspace */ + + *info = 0; + lquery = *ldswork == -1; + if (lquery) { + *ldswork = 2; + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + } + +/* Test the input arguments */ + + if (! notrna && ! lsame_(trana, "C")) { + *info = -1; + } else if (! notrnb && ! 
lsame_(tranb, "C")) { + *info = -2; + } else if (*isgn != 1 && *isgn != -1) { + *info = -3; + } else if (*m < 0) { + *info = -4; + } else if (*n < 0) { + *info = -5; + } else if (*lda < f2cmax(1,*m)) { + *info = -7; + } else if (*ldb < f2cmax(1,*n)) { + *info = -9; + } else if (*ldc < f2cmax(1,*m)) { + *info = -11; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZTRSYL3", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + + *scale = 1.; + if (*m == 0 || *n == 0) { + return 0; + } + + wnrm = (doublereal*)malloc(f2cmax(*m,*n)*sizeof(doublereal)); +/* Use unblocked code for small problems or if insufficient */ +/* workspace is provided */ + + if (f2cmin(nba,nbb) == 1 || *ldswork < f2cmax(nba,nbb)) { + ztrsyl_(trana, tranb, isgn, m, n, &a[a_offset], lda, &b[b_offset], + ldb, &c__[c_offset], ldc, scale, info); + return 0; + } + +/* Set constants to control overflow */ + + smlnum = dlamch_("S"); + bignum = 1. / smlnum; + +/* Set local scaling factors. */ + + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + i__2 = nba; + for (k = 1; k <= i__2; ++k) { + swork[k + l * swork_dim1] = 1.; + } + } + +/* Fallback scaling factor to prevent flushing of SWORK( K, L ) to zero. */ +/* This scaling is to ensure compatibility with TRSYL and may get flushed. */ + + buf = 1.; + +/* Compute upper bounds of blocks of A and B */ + + awrk = nbb; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nba; + for (l = k; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*m) + 1; + if (notrna) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (awrk + l) * swork_dim1] = zlange_("I", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (awrk + k) * swork_dim1] = zlange_("1", &i__3, & + i__4, &a[k1 + l1 * a_dim1], lda, wnrm); + } + } + } + bwrk = nbb + nba; + i__1 = nbb; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*n) + 1; + i__2 = nbb; + for (l = k; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + if (notrnb) { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[k + (bwrk + l) * swork_dim1] = zlange_("I", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } else { + i__3 = k2 - k1; + i__4 = l2 - l1; + swork[l + (bwrk + k) * swork_dim1] = zlange_("1", &i__3, & + i__4, &b[k1 + l1 * b_dim1], ldb, wnrm); + } + } + } + + sgn = (doublereal) (*isgn); + z__1.r = sgn, z__1.i = 0.; + csgn.r = z__1.r, csgn.i = z__1.i; + + if (notrna && notrnb) { + +/* Solve A*X + ISGN*X*B = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-left corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* M L-1 */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(J,L)]. 
*/ +/* I=K+1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__1 = k * nb; + k2 = f2cmin(i__1,*m) + 1; + i__1 = nbb; + for (l = 1; l <= i__1; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__2 = l * nb; + l2 = f2cmin(i__2,*n) + 1; + + i__2 = k2 - k1; + i__3 = l2 - l1; + ztrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = zlange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + for (i__ = k - 1; i__ >= 1; --i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__2 = i__ * nb; + i2 = f2cmin(i__2,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = zlange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L ). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + zdscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + zdscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + z__1.r = -1., z__1.i = 0.; + zgemm_("N", "N", &i__2, &i__3, &i__4, &z__1, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + + } + + i__2 = nbb; + for (j = l + 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__3 = j * nb; + j2 = f2cmin(i__3,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = zlange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + z__1.r = -csgn.r, z__1.i = -csgn.i; + zgemm_("N", "N", &i__3, &i__4, &i__5, &z__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (! notrna && notrnb) { + +/* Solve A**H *X + ISGN*X*B = scale*C. 
*/ + +/* The (K,L)th block of X is determined starting from */ +/* upper-left corner column by column by */ + +/* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L) = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 L-1 */ +/* R(K,L) = SUM [A(I,K)**H*X(I,L)] +ISGN*SUM [X(K,J)*B(J,L)] */ +/* I=1 J=1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + + i__3 = k2 - k1; + i__4 = l2 - l1; + ztrsyl_(trana, tranb, isgn, &i__3, &i__4, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__3); + } + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__3 = k2 - k1; + i__4 = l2 - l1; + xnrm = zlange_("I", &i__3, &i__4, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__3 = nba; + for (i__ = k + 1; i__ <= i__3; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__4 = i__ * nb; + i2 = f2cmin(i__4,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = i2 - i1; + i__5 = l2 - l1; + cnrm = zlange_("I", &i__4, &i__5, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. 
*/ + i__4 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + zdscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = i2 - i1; + zdscal_(&i__5, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__4 = i2 - i1; + i__5 = l2 - l1; + i__6 = k2 - k1; + z__1.r = -1., z__1.i = 0.; + zgemm_("C", "N", &i__4, &i__5, &i__6, &z__1, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + } + + i__3 = nbb; + for (j = l + 1; j <= i__3; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( L, J ) */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__4 = j * nb; + j2 = f2cmin(i__4,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__4 = k2 - k1; + i__5 = j2 - j1; + cnrm = zlange_("I", &i__4, &i__5, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__4 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__4); + i__4 = nbb; + for (jj = 1; jj <= i__4; ++jj) { + i__5 = nba; + for (ll = 1; ll <= i__5; ++ll) { +/* Computing MIN */ + i__6 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__6); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__4 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__4); + i__4 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__4); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__4 = l2 - 1; + for (ll = l1; ll <= i__4; ++ll) { + i__5 = k2 - k1; + zdscal_(&i__5, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__4 = j2 - 1; + for (jj = j1; jj <= i__4; ++jj) { + i__5 = k2 - k1; + zdscal_(&i__5, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__4 = k2 - k1; + i__5 = j2 - j1; + i__6 = l2 - l1; + z__1.r = -csgn.r, z__1.i = -csgn.i; + zgemm_("N", "N", &i__4, &i__5, &i__6, &z__1, &c__[k1 + l1 + * c_dim1], ldc, &b[l1 + j1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (! notrna && ! notrnb) { + +/* Solve A**H *X + ISGN*X*B**H = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* top-right corner column by column by */ + +/* A(K,K)**H*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) */ + +/* Where */ +/* K-1 N */ +/* R(K,L) = SUM [A(I,K)**H*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. */ +/* I=1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__2 = l * nb; + l2 = f2cmin(i__2,*n) + 1; + + i__2 = k2 - k1; + i__3 = l2 - l1; + ztrsyl_(trana, tranb, isgn, &i__2, &i__3, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + } + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__2 = k2 - k1; + i__3 = l2 - l1; + xnrm = zlange_("I", &i__2, &i__3, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__2 = nba; + for (i__ = k + 1; i__ <= i__2; ++i__) { + +/* C( I, L ) := C( I, L ) - A( K, I )**H * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__3 = i__ * nb; + i2 = f2cmin(i__3,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. 
*/ + + i__3 = i2 - i1; + i__4 = l2 - l1; + cnrm = zlange_("I", &i__3, &i__4, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = i2 - i1; + zdscal_(&i__4, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__3 = i2 - i1; + i__4 = l2 - l1; + i__5 = k2 - k1; + z__1.r = -1., z__1.i = 0.; + zgemm_("C", "N", &i__3, &i__4, &i__5, &z__1, &a[k1 + i1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + } + + i__2 = l - 1; + for (j = 1; j <= i__2; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__3 = j * nb; + j2 = f2cmin(i__3,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__3 = k2 - k1; + i__4 = j2 - j1; + cnrm = zlange_("I", &i__3, &i__4, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__3 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__3); + i__3 = nbb; + for (jj = 1; jj <= i__3; ++jj) { + i__4 = nba; + for (ll = 1; ll <= i__4; ++ll) { +/* Computing MIN */ + i__5 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__5); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__3 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__3); + i__3 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__3); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__3 = j2 - 1; + for (jj = j1; jj <= i__3; ++jj) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__3 = k2 - k1; + i__4 = j2 - j1; + i__5 = l2 - l1; + z__1.r = -csgn.r, z__1.i = -csgn.i; + zgemm_("N", "C", &i__3, &i__4, &i__5, &z__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + } else if (notrna && ! notrnb) { + +/* Solve A*X + ISGN*X*B**H = scale*C. */ + +/* The (K,L)th block of X is determined starting from */ +/* bottom-right corner column by column by */ + +/* A(K,K)*X(K,L) + ISGN*X(K,L)*B(L,L)**H = C(K,L) - R(K,L) */ + +/* Where */ +/* M N */ +/* R(K,L) = SUM [A(K,I)*X(I,L)] + ISGN*SUM [X(K,J)*B(L,J)**H]. */ +/* I=K+1 J=L+1 */ + +/* Start loop over block rows (index = K) and block columns (index = L) */ + + for (k = nba; k >= 1; --k) { + +/* K1: row index of the first row in X( K, L ) */ +/* K2: row index of the first row in X( K+1, L ) */ +/* so the K2 - K1 is the column count of the block X( K, L ) */ + + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__1 = k * nb; + k2 = f2cmin(i__1,*m) + 1; + for (l = nbb; l >= 1; --l) { + +/* L1: column index of the first column in X( K, L ) */ +/* L2: column index of the first column in X( K, L + 1) */ +/* so that L2 - L1 is the row count of the block X( K, L ) */ + + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__1 = l * nb; + l2 = f2cmin(i__1,*n) + 1; + + i__1 = k2 - k1; + i__2 = l2 - l1; + ztrsyl_(trana, tranb, isgn, &i__1, &i__2, &a[k1 + k1 * a_dim1] + , lda, &b[l1 + l1 * b_dim1], ldb, &c__[k1 + l1 * + c_dim1], ldc, &scaloc, &iinfo); + *info = f2cmax(*info,iinfo); + + if (scaloc * swork[k + l * swork_dim1] == 0.) { + if (scaloc == 0.) { +/* The magnitude of the largest entry of X(K1:K2-1, L1:L2-1) */ +/* is larger than the product of BIGNUM**2 and cannot be */ +/* represented in the form (1/SCALE)*X(K1:K2-1, L1:L2-1). */ +/* Mark the computation as pointless. */ + buf = 0.; + } else { +/* Use second scaling factor to prevent flushing to zero. */ + i__1 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__1); + } + i__1 = nbb; + for (jj = 1; jj <= i__1; ++jj) { + i__2 = nba; + for (ll = 1; ll <= i__2; ++ll) { +/* Bound by BIGNUM to not introduce Inf. The value */ +/* is irrelevant; corresponding entries of the */ +/* solution will be flushed in consistency scaling. */ +/* Computing MIN */ + i__3 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * swork_dim1] + / pow_di(&c_b18, &i__3); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + } + swork[k + l * swork_dim1] = scaloc * swork[k + l * swork_dim1] + ; + i__1 = k2 - k1; + i__2 = l2 - l1; + xnrm = zlange_("I", &i__1, &i__2, &c__[k1 + l1 * c_dim1], ldc, + wnrm); + + i__1 = k - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + +/* C( I, L ) := C( I, L ) - A( I, K ) * C( K, L ) */ + + i1 = (i__ - 1) * nb + 1; +/* Computing MIN */ + i__2 = i__ * nb; + i2 = f2cmin(i__2,*m) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. 
*/ + + i__2 = i2 - i1; + i__3 = l2 - l1; + cnrm = zlange_("I", &i__2, &i__3, &c__[i1 + l1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[i__ + l * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[i__ + l * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + anrm = swork[i__ + (awrk + k) * swork_dim1]; + scaloc = dlarmm_(&anrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( I, L ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = k2 - k1; + zdscal_(&i__3, &scal, &c__[k1 + ll * c_dim1], & + c__1); + } + } + + scal = scamin / swork[i__ + l * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = l2 - 1; + for (ll = l1; ll <= i__2; ++ll) { + i__3 = i2 - i1; + zdscal_(&i__3, &scal, &c__[i1 + ll * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[i__ + l * swork_dim1] = scamin * scaloc; + + i__2 = i2 - i1; + i__3 = l2 - l1; + i__4 = k2 - k1; + z__1.r = -1., z__1.i = 0.; + zgemm_("N", "N", &i__2, &i__3, &i__4, &z__1, &a[i1 + k1 * + a_dim1], lda, &c__[k1 + l1 * c_dim1], ldc, &c_b1, + &c__[i1 + l1 * c_dim1], ldc) + ; + + } + + i__1 = l - 1; + for (j = 1; j <= i__1; ++j) { + +/* C( K, J ) := C( K, J ) - SGN * C( K, L ) * B( J, L )**H */ + + j1 = (j - 1) * nb + 1; +/* Computing MIN */ + i__2 = j * nb; + j2 = f2cmin(i__2,*n) + 1; + +/* Compute scaling factor to survive the linear update */ +/* simulating consistent scaling. */ + + i__2 = k2 - k1; + i__3 = j2 - j1; + cnrm = zlange_("I", &i__2, &i__3, &c__[k1 + j1 * c_dim1], + ldc, wnrm); +/* Computing MIN */ + d__1 = swork[k + j * swork_dim1], d__2 = swork[k + l * + swork_dim1]; + scamin = f2cmin(d__1,d__2); + cnrm *= scamin / swork[k + j * swork_dim1]; + xnrm *= scamin / swork[k + l * swork_dim1]; + bnrm = swork[l + (bwrk + j) * swork_dim1]; + scaloc = dlarmm_(&bnrm, &xnrm, &cnrm); + if (scaloc * scamin == 0.) { +/* Use second scaling factor to prevent flushing to zero. */ + i__2 = myexp_(&scaloc); + buf *= pow_di(&c_b18, &i__2); + i__2 = nbb; + for (jj = 1; jj <= i__2; ++jj) { + i__3 = nba; + for (ll = 1; ll <= i__3; ++ll) { +/* Computing MIN */ + i__4 = myexp_(&scaloc); + d__1 = bignum, d__2 = swork[ll + jj * + swork_dim1] / pow_di(&c_b18, &i__4); + swork[ll + jj * swork_dim1] = f2cmin(d__1,d__2); + } + } + i__2 = myexp_(&scaloc); + scamin /= pow_di(&c_b18, &i__2); + i__2 = myexp_(&scaloc); + scaloc /= pow_di(&c_b18, &i__2); + } + cnrm *= scaloc; + xnrm *= scaloc; + +/* Simultaneously apply the robust update factor and the */ +/* consistency scaling factor to C( K, J ) and C( K, L). */ + + scal = scamin / swork[k + l * swork_dim1] * scaloc; + if (scal != 1.) 
{ + i__2 = l2 - 1; + for (jj = l1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + zdscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + + scal = scamin / swork[k + j * swork_dim1] * scaloc; + if (scal != 1.) { + i__2 = j2 - 1; + for (jj = j1; jj <= i__2; ++jj) { + i__3 = k2 - k1; + zdscal_(&i__3, &scal, &c__[k1 + jj * c_dim1], & + c__1); + } + } + +/* Record current scaling factor */ + + swork[k + l * swork_dim1] = scamin * scaloc; + swork[k + j * swork_dim1] = scamin * scaloc; + + i__2 = k2 - k1; + i__3 = j2 - j1; + i__4 = l2 - l1; + z__1.r = -csgn.r, z__1.i = -csgn.i; + zgemm_("N", "C", &i__2, &i__3, &i__4, &z__1, &c__[k1 + l1 + * c_dim1], ldc, &b[j1 + l1 * b_dim1], ldb, &c_b1, + &c__[k1 + j1 * c_dim1], ldc) + ; + } + } + } + + } + + free(wnrm); + +/* Reduce local scaling factors */ + + *scale = swork[swork_dim1 + 1]; + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { +/* Computing MIN */ + d__1 = *scale, d__2 = swork[k + l * swork_dim1]; + *scale = f2cmin(d__1,d__2); + } + } + if (*scale == 0.) { + +/* The magnitude of the largest entry of the solution is larger */ +/* than the product of BIGNUM**2 and cannot be represented in the */ +/* form (1/SCALE)*X if SCALE is DOUBLE PRECISION. Set SCALE to */ +/* zero and give up. */ + + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + return 0; + } + +/* Realize consistent scaling */ + + i__1 = nba; + for (k = 1; k <= i__1; ++k) { + k1 = (k - 1) * nb + 1; +/* Computing MIN */ + i__2 = k * nb; + k2 = f2cmin(i__2,*m) + 1; + i__2 = nbb; + for (l = 1; l <= i__2; ++l) { + l1 = (l - 1) * nb + 1; +/* Computing MIN */ + i__3 = l * nb; + l2 = f2cmin(i__3,*n) + 1; + scal = *scale / swork[k + l * swork_dim1]; + if (scal != 1.) { + i__3 = l2 - 1; + for (ll = l1; ll <= i__3; ++ll) { + i__4 = k2 - k1; + zdscal_(&i__4, &scal, &c__[k1 + ll * c_dim1], &c__1); + } + } + } + } + + if (buf != 1. && buf > 0.) { + +/* Decrease SCALE as much as possible. */ + +/* Computing MIN */ + d__1 = *scale / smlnum, d__2 = 1. / buf; + scaloc = f2cmin(d__1,d__2); + buf *= scaloc; + *scale /= scaloc; + } + + if (buf != 1. && buf > 0.) { + +/* In case of overly aggressive scaling during the computation, */ +/* flushing of the global scale factor may be prevented by */ +/* undoing some of the scaling. This step is to ensure that */ +/* this routine flushes only scale factors that TRSYL also */ +/* flushes and be usable as a drop-in replacement. */ + +/* How much can the normwise largest entry be upscaled? */ + +/* Computing MAX */ + i__1 = c_dim1 + 1; + d__3 = (d__1 = c__[i__1].r, abs(d__1)), d__4 = (d__2 = d_imag(&c__[ + c_dim1 + 1]), abs(d__2)); + scal = f2cmax(d__3,d__4); + i__1 = *m; + for (k = 1; k <= i__1; ++k) { + i__2 = *n; + for (l = 1; l <= i__2; ++l) { +/* Computing MAX */ + i__3 = k + l * c_dim1; + d__3 = scal, d__4 = (d__1 = c__[i__3].r, abs(d__1)), d__3 = + f2cmax(d__3,d__4), d__4 = (d__2 = d_imag(&c__[k + l * + c_dim1]), abs(d__2)); + scal = f2cmax(d__3,d__4); + } + } + +/* Increase BUF as close to 1 as possible and apply scaling. */ + +/* Computing MIN */ + d__1 = bignum / scal, d__2 = 1. / buf; + scaloc = f2cmin(d__1,d__2); + buf *= scaloc; + zlascl_("G", &c_n1, &c_n1, &c_b106, &scaloc, m, n, &c__[c_offset], + ldc, &iinfo); + } + +/* Combine with buffer scaling factor. SCALE will be flushed if */ +/* BUF is less than one here. 
*/ + + *scale *= buf; + +/* Restore workspace dimensions */ + + swork[swork_dim1 + 1] = (doublereal) f2cmax(nba,nbb); + swork[swork_dim1 + 2] = (doublereal) ((nbb << 1) + nba); + + return 0; + +/* End of ZTRSYL3 */ + +} /* ztrsyl3_ */ + From 2a97ca615f373d7385df3becf97d1295cc3fc29e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 16 Nov 2022 07:36:40 +0100 Subject: [PATCH 098/154] MSVC compatibility fixes --- lapack-netlib/SRC/clatrs3.c | 157 +++++++++++++++++++++++++++++++---- lapack-netlib/SRC/ctrsyl3.c | 155 +++++++++++++++++++++++++++++++---- lapack-netlib/SRC/dlarmm.c | 157 +++++++++++++++++++++++++++++++---- lapack-netlib/SRC/dlatrs3.c | 157 +++++++++++++++++++++++++++++++---- lapack-netlib/SRC/dtrsyl3.c | 155 +++++++++++++++++++++++++++++++---- lapack-netlib/SRC/slarmm.c | 157 +++++++++++++++++++++++++++++++---- lapack-netlib/SRC/slatrs3.c | 157 +++++++++++++++++++++++++++++++---- lapack-netlib/SRC/strsyl3.c | 158 +++++++++++++++++++++++++++++++---- lapack-netlib/SRC/zlatrs3.c | 156 +++++++++++++++++++++++++++++++---- lapack-netlib/SRC/ztrsyl3.c | 159 ++++++++++++++++++++++++++++++++---- 10 files changed, 1413 insertions(+), 155 deletions(-) diff --git a/lapack-netlib/SRC/clatrs3.c b/lapack-netlib/SRC/clatrs3.c index 6124a7f19..f6d76cf49 100644 --- a/lapack-netlib/SRC/clatrs3.c +++ b/lapack-netlib/SRC/clatrs3.c @@ -1,12 +1,3 @@ -/* f2c.h -- Standard Fortran to C header file */ - -/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." - - - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ - -#ifndef F2C_INCLUDE -#define F2C_INCLUDE - #include #include #include @@ -19,7 +10,28 @@ #undef I #endif -typedef int integer; +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + typedef unsigned int uinteger; typedef char *address; typedef short int shortint; @@ -27,10 +39,17 @@ typedef float real; typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif #define pCf(z) (*_pCf(z)) #define pCd(z) (*_pCd(z)) typedef int logical; @@ -170,8 +189,13 @@ typedef struct Namelist Namelist; #define abort_() { sig_die("Fortran abort routine called", 1); } #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else #define c_div(c, a, b) {pCf(c) = 
Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif #define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} #define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} #define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} @@ -183,13 +207,13 @@ typedef struct Namelist Namelist; #define d_atan(x) (atan(*(x))) #define d_atn2(x, y) (atan2(*(x),*(y))) #define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } -#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } #define d_cos(x) (cos(*(x))) #define d_cosh(x) (cosh(*(x))) #define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) #define d_exp(x) (exp(*(x))) #define d_imag(z) (cimag(Cd(z))) -#define r_imag(z) (cimag(Cf(z))) +#define r_imag(z) (cimagf(Cf(z))) #define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) @@ -229,10 +253,13 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} #define myexit_() break; #define mycycle_() continue; -#define myceiling_(w) ceil(w) -#define myhuge_(w) HUGE_VAL +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} /* procedure parameter types for -A and -C++ */ @@ -267,6 +294,21 @@ static double dpow_ui(double x, integer n) { } return pow; } +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else static _Complex float cpow_ui(_Complex float x, integer n) { _Complex float pow=1.0; unsigned long int u; if(n != 0) { @@ -279,6 +321,22 @@ static _Complex float cpow_ui(_Complex float x, integer n) { } return pow; } +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else static _Complex double zpow_ui(_Complex double x, integer n) { _Complex double pow=1.0; unsigned long int u; if(n != 0) { @@ -291,6 +349,7 @@ static _Complex double zpow_ui(_Complex double x, integer n) { } return pow; } +#endif static integer pow_ii(integer x, integer n) { integer pow; unsigned long int u; if (n <= 0) { @@ -324,6 +383,22 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) } static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i #include #include @@ -19,7 +10,28 @@ #undef I #endif -typedef int integer; +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + 
+#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + typedef unsigned int uinteger; typedef char *address; typedef short int shortint; @@ -27,10 +39,17 @@ typedef float real; typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif #define pCf(z) (*_pCf(z)) #define pCd(z) (*_pCd(z)) typedef int logical; @@ -157,7 +176,6 @@ struct Namelist { }; typedef struct Namelist Namelist; -#define exponent(x) #define abs(x) ((x) >= 0 ? (x) : -(x)) #define dabs(x) (fabs(x)) #define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) @@ -171,8 +189,13 @@ typedef struct Namelist Namelist; #define abort_() { sig_die("Fortran abort routine called", 1); } #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif #define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} #define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} #define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} @@ -184,13 +207,13 @@ typedef struct Namelist Namelist; #define d_atan(x) (atan(*(x))) #define d_atn2(x, y) (atan2(*(x),*(y))) #define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } -#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } #define d_cos(x) (cos(*(x))) #define d_cosh(x) (cosh(*(x))) #define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) #define d_exp(x) (exp(*(x))) #define d_imag(z) (cimag(Cd(z))) -#define r_imag(z) (cimag(Cf(z))) +#define r_imag(z) (cimagf(Cf(z))) #define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define r_int(__x) (*(__x)>0 ? 
floor(*(__x)) : -floor(- *(__x))) #define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) @@ -230,13 +253,14 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} #define myexit_() break; #define mycycle_() continue; -#define myceiling_(w) ceil(w) -#define myhuge_(w) HUGE_VAL +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) #define myexp_(w) my_expfunc(w) static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} + /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 @@ -270,6 +294,21 @@ static double dpow_ui(double x, integer n) { } return pow; } +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else static _Complex float cpow_ui(_Complex float x, integer n) { _Complex float pow=1.0; unsigned long int u; if(n != 0) { @@ -282,6 +321,22 @@ static _Complex float cpow_ui(_Complex float x, integer n) { } return pow; } +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else static _Complex double zpow_ui(_Complex double x, integer n) { _Complex double pow=1.0; unsigned long int u; if(n != 0) { @@ -294,6 +349,7 @@ static _Complex double zpow_ui(_Complex double x, integer n) { } return pow; } +#endif static integer pow_ii(integer x, integer n) { integer pow; unsigned long int u; if (n <= 0) { @@ -327,6 +383,22 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) } static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i #include #include @@ -19,7 +10,28 @@ #undef I #endif -typedef int integer; +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + typedef unsigned int uinteger; typedef char *address; typedef short int shortint; @@ -27,10 +39,17 @@ typedef float real; typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else 
static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif #define pCf(z) (*_pCf(z)) #define pCd(z) (*_pCd(z)) typedef int logical; @@ -170,8 +189,13 @@ typedef struct Namelist Namelist; #define abort_() { sig_die("Fortran abort routine called", 1); } #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif #define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} #define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} #define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} @@ -183,13 +207,13 @@ typedef struct Namelist Namelist; #define d_atan(x) (atan(*(x))) #define d_atn2(x, y) (atan2(*(x),*(y))) #define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } -#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } #define d_cos(x) (cos(*(x))) #define d_cosh(x) (cosh(*(x))) #define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) #define d_exp(x) (exp(*(x))) #define d_imag(z) (cimag(Cd(z))) -#define r_imag(z) (cimag(Cf(z))) +#define r_imag(z) (cimagf(Cf(z))) #define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) @@ -229,10 +253,13 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} #define myexit_() break; #define mycycle_() continue; -#define myceiling_(w) ceil(w) -#define myhuge_(w) HUGE_VAL +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} /* procedure parameter types for -A and -C++ */ @@ -267,6 +294,21 @@ static double dpow_ui(double x, integer n) { } return pow; } +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else static _Complex float cpow_ui(_Complex float x, integer n) { _Complex float pow=1.0; unsigned long int u; if(n != 0) { @@ -279,6 +321,22 @@ static _Complex float cpow_ui(_Complex float x, integer n) { } return pow; } +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else static _Complex 
double zpow_ui(_Complex double x, integer n) { _Complex double pow=1.0; unsigned long int u; if(n != 0) { @@ -291,6 +349,7 @@ static _Complex double zpow_ui(_Complex double x, integer n) { } return pow; } +#endif static integer pow_ii(integer x, integer n) { integer pow; unsigned long int u; if (n <= 0) { @@ -324,6 +383,22 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) } static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b DLARMM */ /* Definition: */ diff --git a/lapack-netlib/SRC/dlatrs3.c b/lapack-netlib/SRC/dlatrs3.c index b6e15eb12..46eca6379 100644 --- a/lapack-netlib/SRC/dlatrs3.c +++ b/lapack-netlib/SRC/dlatrs3.c @@ -1,12 +1,3 @@ -/* f2c.h -- Standard Fortran to C header file */ - -/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." - - - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ - -#ifndef F2C_INCLUDE -#define F2C_INCLUDE - #include #include #include @@ -19,7 +10,28 @@ #undef I #endif -typedef int integer; +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + typedef unsigned int uinteger; typedef char *address; typedef short int shortint; @@ -27,10 +39,17 @@ typedef float real; typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif #define pCf(z) (*_pCf(z)) #define pCd(z) (*_pCd(z)) typedef int logical; @@ -170,8 +189,13 @@ typedef struct Namelist Namelist; #define abort_() { sig_die("Fortran abort routine called", 1); } #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif #define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} #define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} #define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} @@ -183,13 +207,13 @@ typedef struct Namelist Namelist; #define d_atan(x) (atan(*(x))) #define d_atn2(x, y) (atan2(*(x),*(y))) #define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } -#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } #define 
d_cos(x) (cos(*(x))) #define d_cosh(x) (cosh(*(x))) #define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) #define d_exp(x) (exp(*(x))) #define d_imag(z) (cimag(Cd(z))) -#define r_imag(z) (cimag(Cf(z))) +#define r_imag(z) (cimagf(Cf(z))) #define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) @@ -229,10 +253,13 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} #define myexit_() break; #define mycycle_() continue; -#define myceiling_(w) ceil(w) -#define myhuge_(w) HUGE_VAL +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} /* procedure parameter types for -A and -C++ */ @@ -267,6 +294,21 @@ static double dpow_ui(double x, integer n) { } return pow; } +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else static _Complex float cpow_ui(_Complex float x, integer n) { _Complex float pow=1.0; unsigned long int u; if(n != 0) { @@ -279,6 +321,22 @@ static _Complex float cpow_ui(_Complex float x, integer n) { } return pow; } +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else static _Complex double zpow_ui(_Complex double x, integer n) { _Complex double pow=1.0; unsigned long int u; if(n != 0) { @@ -291,6 +349,7 @@ static _Complex double zpow_ui(_Complex double x, integer n) { } return pow; } +#endif static integer pow_ii(integer x, integer n) { integer pow; unsigned long int u; if (n <= 0) { @@ -324,6 +383,22 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) } static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i #include #include @@ -19,7 +10,28 @@ #undef I #endif -typedef int integer; +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + typedef unsigned int uinteger; typedef char *address; typedef short int shortint; @@ -27,10 +39,17 @@ typedef float real; typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; +#ifdef 
_MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif #define pCf(z) (*_pCf(z)) #define pCd(z) (*_pCd(z)) typedef int logical; @@ -157,7 +176,6 @@ struct Namelist { }; typedef struct Namelist Namelist; -#define exponent(x) #define abs(x) ((x) >= 0 ? (x) : -(x)) #define dabs(x) (fabs(x)) #define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) @@ -171,8 +189,13 @@ typedef struct Namelist Namelist; #define abort_() { sig_die("Fortran abort routine called", 1); } #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif #define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} #define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} #define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} @@ -184,13 +207,13 @@ typedef struct Namelist Namelist; #define d_atan(x) (atan(*(x))) #define d_atn2(x, y) (atan2(*(x),*(y))) #define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } -#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } #define d_cos(x) (cos(*(x))) #define d_cosh(x) (cosh(*(x))) #define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) #define d_exp(x) (exp(*(x))) #define d_imag(z) (cimag(Cd(z))) -#define r_imag(z) (cimag(Cf(z))) +#define r_imag(z) (cimagf(Cf(z))) #define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define r_int(__x) (*(__x)>0 ? 
floor(*(__x)) : -floor(- *(__x))) #define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) @@ -230,13 +253,14 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} #define myexit_() break; #define mycycle_() continue; -#define myceiling_(w) ceil(w) -#define myhuge_(w) HUGE_VAL +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) #define myexp_(w) my_expfunc(w) static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} + /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 @@ -270,6 +294,21 @@ static double dpow_ui(double x, integer n) { } return pow; } +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else static _Complex float cpow_ui(_Complex float x, integer n) { _Complex float pow=1.0; unsigned long int u; if(n != 0) { @@ -282,6 +321,22 @@ static _Complex float cpow_ui(_Complex float x, integer n) { } return pow; } +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else static _Complex double zpow_ui(_Complex double x, integer n) { _Complex double pow=1.0; unsigned long int u; if(n != 0) { @@ -294,6 +349,7 @@ static _Complex double zpow_ui(_Complex double x, integer n) { } return pow; } +#endif static integer pow_ii(integer x, integer n) { integer pow; unsigned long int u; if (n <= 0) { @@ -327,6 +383,22 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) } static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i #include #include @@ -19,7 +10,28 @@ #undef I #endif -typedef int integer; +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + typedef unsigned int uinteger; typedef char *address; typedef short int shortint; @@ -27,10 +39,17 @@ typedef float real; typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else 
static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif #define pCf(z) (*_pCf(z)) #define pCd(z) (*_pCd(z)) typedef int logical; @@ -170,8 +189,13 @@ typedef struct Namelist Namelist; #define abort_() { sig_die("Fortran abort routine called", 1); } #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif #define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} #define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} #define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} @@ -183,13 +207,13 @@ typedef struct Namelist Namelist; #define d_atan(x) (atan(*(x))) #define d_atn2(x, y) (atan2(*(x),*(y))) #define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } -#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } #define d_cos(x) (cos(*(x))) #define d_cosh(x) (cosh(*(x))) #define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) #define d_exp(x) (exp(*(x))) #define d_imag(z) (cimag(Cd(z))) -#define r_imag(z) (cimag(Cf(z))) +#define r_imag(z) (cimagf(Cf(z))) #define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) @@ -229,10 +253,13 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} #define myexit_() break; #define mycycle_() continue; -#define myceiling_(w) ceil(w) -#define myhuge_(w) HUGE_VAL +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} /* procedure parameter types for -A and -C++ */ @@ -267,6 +294,21 @@ static double dpow_ui(double x, integer n) { } return pow; } +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else static _Complex float cpow_ui(_Complex float x, integer n) { _Complex float pow=1.0; unsigned long int u; if(n != 0) { @@ -279,6 +321,22 @@ static _Complex float cpow_ui(_Complex float x, integer n) { } return pow; } +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else static _Complex 
double zpow_ui(_Complex double x, integer n) { _Complex double pow=1.0; unsigned long int u; if(n != 0) { @@ -291,6 +349,7 @@ static _Complex double zpow_ui(_Complex double x, integer n) { } return pow; } +#endif static integer pow_ii(integer x, integer n) { integer pow; unsigned long int u; if (n <= 0) { @@ -324,6 +383,22 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) } static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief \b SLARMM */ /* Definition: */ diff --git a/lapack-netlib/SRC/slatrs3.c b/lapack-netlib/SRC/slatrs3.c index 2d8c0ab33..e5c48a55b 100644 --- a/lapack-netlib/SRC/slatrs3.c +++ b/lapack-netlib/SRC/slatrs3.c @@ -1,12 +1,3 @@ -/* f2c.h -- Standard Fortran to C header file */ - -/** barf [ba:rf] 2. "He suggested using FORTRAN, and everybody barfed." - - - From The Shogakukan DICTIONARY OF NEW ENGLISH (Second edition) */ - -#ifndef F2C_INCLUDE -#define F2C_INCLUDE - #include #include #include @@ -19,7 +10,28 @@ #undef I #endif -typedef int integer; +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + typedef unsigned int uinteger; typedef char *address; typedef short int shortint; @@ -27,10 +39,17 @@ typedef float real; typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif #define pCf(z) (*_pCf(z)) #define pCd(z) (*_pCd(z)) typedef int logical; @@ -170,8 +189,13 @@ typedef struct Namelist Namelist; #define abort_() { sig_die("Fortran abort routine called", 1); } #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif #define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} #define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} #define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} @@ -183,13 +207,13 @@ typedef struct Namelist Namelist; #define d_atan(x) (atan(*(x))) #define d_atn2(x, y) (atan2(*(x),*(y))) #define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } -#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } #define 
d_cos(x) (cos(*(x))) #define d_cosh(x) (cosh(*(x))) #define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) #define d_exp(x) (exp(*(x))) #define d_imag(z) (cimag(Cd(z))) -#define r_imag(z) (cimag(Cf(z))) +#define r_imag(z) (cimagf(Cf(z))) #define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) @@ -229,10 +253,13 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} #define myexit_() break; #define mycycle_() continue; -#define myceiling_(w) ceil(w) -#define myhuge_(w) HUGE_VAL +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} /* procedure parameter types for -A and -C++ */ @@ -267,6 +294,21 @@ static double dpow_ui(double x, integer n) { } return pow; } +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else static _Complex float cpow_ui(_Complex float x, integer n) { _Complex float pow=1.0; unsigned long int u; if(n != 0) { @@ -279,6 +321,22 @@ static _Complex float cpow_ui(_Complex float x, integer n) { } return pow; } +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else static _Complex double zpow_ui(_Complex double x, integer n) { _Complex double pow=1.0; unsigned long int u; if(n != 0) { @@ -291,6 +349,7 @@ static _Complex double zpow_ui(_Complex double x, integer n) { } return pow; } +#endif static integer pow_ii(integer x, integer n) { integer pow; unsigned long int u; if (n <= 0) { @@ -324,6 +383,22 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) } static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i #include #include @@ -19,7 +10,28 @@ #undef I #endif -typedef int integer; +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + typedef unsigned int uinteger; typedef char *address; typedef short int shortint; @@ -27,10 +39,17 @@ typedef float real; typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; +#ifdef 
_MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif #define pCf(z) (*_pCf(z)) #define pCd(z) (*_pCd(z)) typedef int logical; @@ -157,7 +176,6 @@ struct Namelist { }; typedef struct Namelist Namelist; -#define exponent(x) #define abs(x) ((x) >= 0 ? (x) : -(x)) #define dabs(x) (fabs(x)) #define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) @@ -171,8 +189,13 @@ typedef struct Namelist Namelist; #define abort_() { sig_die("Fortran abort routine called", 1); } #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif #define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} #define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} #define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} @@ -184,13 +207,13 @@ typedef struct Namelist Namelist; #define d_atan(x) (atan(*(x))) #define d_atn2(x, y) (atan2(*(x),*(y))) #define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } -#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } #define d_cos(x) (cos(*(x))) #define d_cosh(x) (cosh(*(x))) #define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) #define d_exp(x) (exp(*(x))) #define d_imag(z) (cimag(Cd(z))) -#define r_imag(z) (cimag(Cf(z))) +#define r_imag(z) (cimagf(Cf(z))) #define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define r_int(__x) (*(__x)>0 ? 
floor(*(__x)) : -floor(- *(__x))) #define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) @@ -230,13 +253,14 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} #define myexit_() break; #define mycycle_() continue; -#define myceiling_(w) ceil(w) -#define myhuge_(w) HUGE_VAL +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) #define myexp_(w) my_expfunc(w) -static int my_expfunc(float* x) {int e; (void)frexpf(*x,&e); return e;} +static int my_expfunc(float *x) {int e; (void)frexpf(*x,&e); return e;} + /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 @@ -270,6 +294,21 @@ static double dpow_ui(double x, integer n) { } return pow; } +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else static _Complex float cpow_ui(_Complex float x, integer n) { _Complex float pow=1.0; unsigned long int u; if(n != 0) { @@ -282,6 +321,22 @@ static _Complex float cpow_ui(_Complex float x, integer n) { } return pow; } +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else static _Complex double zpow_ui(_Complex double x, integer n) { _Complex double pow=1.0; unsigned long int u; if(n != 0) { @@ -294,6 +349,7 @@ static _Complex double zpow_ui(_Complex double x, integer n) { } return pow; } +#endif static integer pow_ii(integer x, integer n) { integer pow; unsigned long int u; if (n <= 0) { @@ -327,6 +383,22 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) } static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i #include #include @@ -19,7 +10,28 @@ #undef I #endif -typedef int integer; +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + typedef unsigned int uinteger; typedef char *address; typedef short int shortint; @@ -27,10 +39,17 @@ typedef float real; typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static 
inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif #define pCf(z) (*_pCf(z)) #define pCd(z) (*_pCd(z)) typedef int logical; @@ -170,8 +189,13 @@ typedef struct Namelist Namelist; #define abort_() { sig_die("Fortran abort routine called", 1); } #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif #define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} #define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} #define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} @@ -183,13 +207,13 @@ typedef struct Namelist Namelist; #define d_atan(x) (atan(*(x))) #define d_atn2(x, y) (atan2(*(x),*(y))) #define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } -#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } #define d_cos(x) (cos(*(x))) #define d_cosh(x) (cosh(*(x))) #define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) #define d_exp(x) (exp(*(x))) #define d_imag(z) (cimag(Cd(z))) -#define r_imag(z) (cimag(Cf(z))) +#define r_imag(z) (cimagf(Cf(z))) #define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define r_int(__x) (*(__x)>0 ? 
floor(*(__x)) : -floor(- *(__x))) #define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) @@ -229,10 +253,13 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} #define myexit_() break; #define mycycle_() continue; -#define myceiling_(w) ceil(w) -#define myhuge_(w) HUGE_VAL +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) +#define myexp_(w) my_expfunc(w) + +static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} /* procedure parameter types for -A and -C++ */ @@ -267,6 +294,21 @@ static double dpow_ui(double x, integer n) { } return pow; } +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else static _Complex float cpow_ui(_Complex float x, integer n) { _Complex float pow=1.0; unsigned long int u; if(n != 0) { @@ -279,6 +321,22 @@ static _Complex float cpow_ui(_Complex float x, integer n) { } return pow; } +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else static _Complex double zpow_ui(_Complex double x, integer n) { _Complex double pow=1.0; unsigned long int u; if(n != 0) { @@ -291,6 +349,7 @@ static _Complex double zpow_ui(_Complex double x, integer n) { } return pow; } +#endif static integer pow_ii(integer x, integer n) { integer pow; unsigned long int u; if (n <= 0) { @@ -324,6 +383,22 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) } static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i #include #include @@ -19,7 +10,28 @@ #undef I #endif -typedef int integer; +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + typedef unsigned int uinteger; typedef char *address; typedef short int shortint; @@ -27,10 +39,17 @@ typedef float real; typedef double doublereal; typedef struct { real r, i; } complex; typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else static inline _Complex float 
Cf(complex *z) {return z->r + z->i*_Complex_I;} static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif #define pCf(z) (*_pCf(z)) #define pCd(z) (*_pCd(z)) typedef int logical; @@ -157,7 +176,6 @@ struct Namelist { }; typedef struct Namelist Namelist; -#define exponent(x) #define abs(x) ((x) >= 0 ? (x) : -(x)) #define dabs(x) (fabs(x)) #define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) @@ -171,8 +189,13 @@ typedef struct Namelist Namelist; #define abort_() { sig_die("Fortran abort routine called", 1); } #define c_abs(z) (cabsf(Cf(z))) #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif #define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} #define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} #define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} @@ -184,13 +207,13 @@ typedef struct Namelist Namelist; #define d_atan(x) (atan(*(x))) #define d_atn2(x, y) (atan2(*(x),*(y))) #define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } -#define r_cnjg(R, Z) { pCf(R) = conj(Cf(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } #define d_cos(x) (cos(*(x))) #define d_cosh(x) (cosh(*(x))) #define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) #define d_exp(x) (exp(*(x))) #define d_imag(z) (cimag(Cd(z))) -#define r_imag(z) (cimag(Cf(z))) +#define r_imag(z) (cimagf(Cf(z))) #define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) #define r_int(__x) (*(__x)>0 ? 
floor(*(__x)) : -floor(- *(__x))) #define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) @@ -230,13 +253,15 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} #define myexit_() break; #define mycycle_() continue; -#define myceiling_(w) ceil(w) -#define myhuge_(w) HUGE_VAL +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) #define myexp_(w) my_expfunc(w) static int my_expfunc(double *x) {int e; (void)frexp(*x,&e); return e;} + + /* procedure parameter types for -A and -C++ */ #define F2C_proc_par_types 1 @@ -270,6 +295,21 @@ static double dpow_ui(double x, integer n) { } return pow; } +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else static _Complex float cpow_ui(_Complex float x, integer n) { _Complex float pow=1.0; unsigned long int u; if(n != 0) { @@ -282,6 +322,22 @@ static _Complex float cpow_ui(_Complex float x, integer n) { } return pow; } +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else static _Complex double zpow_ui(_Complex double x, integer n) { _Complex double pow=1.0; unsigned long int u; if(n != 0) { @@ -294,6 +350,7 @@ static _Complex double zpow_ui(_Complex double x, integer n) { } return pow; } +#endif static integer pow_ii(integer x, integer n) { integer pow; unsigned long int u; if (n <= 0) { @@ -327,6 +384,22 @@ static integer smaxloc_(float *w, integer s, integer e, integer *n) } static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i Date: Wed, 16 Nov 2022 21:57:42 +0100 Subject: [PATCH 099/154] change line endings from CRLF to LF --- cmake/lapack.cmake | 2028 ++++++++++++++++++++++---------------------- 1 file changed, 1014 insertions(+), 1014 deletions(-) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 82511d41b..ca3a1e184 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -1,1014 +1,1014 @@ -# Sources for compiling lapack-netlib. Can't use CMakeLists.txt because lapack-netlib already has its own cmake files. 
-if (NOT C_LAPACK) - message (STATUS "fortran lapack") -set(ALLAUX ilaenv.f ilaenv2stage.f ieeeck.f lsamen.f iparmq.f iparam2stage.F - ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f dlaset.f la_xisnan.F90 - ../INSTALL/ilaver.f xerbla_array.f - ../INSTALL/slamch.f) - -set(SCLAUX - scombssq.f sbdsvdx.f sstevx.f sstein.f - la_constants.f90 - sbdsdc.f - sbdsqr.f sdisna.f slabad.f slacpy.f sladiv.f slae2.f slaebz.f - slaed0.f slaed1.f slaed2.f slaed3.f slaed4.f slaed5.f slaed6.f - slaed7.f slaed8.f slaed9.f slaeda.f slaev2.f slagtf.f - slagts.f slamrg.f slanst.f - slapy2.f slapy3.f slarnv.f - slarra.f slarrb.f slarrc.f slarrd.f slarre.f slarrf.f slarrj.f - slarrk.f slarrr.f slaneg.f - slartg.f90 slaruv.f slas2.f slascl.f - slasd0.f slasd1.f slasd2.f slasd3.f slasd4.f slasd5.f slasd6.f - slasd7.f slasd8.f slasda.f slasdq.f slasdt.f - slaset.f slasq1.f slasq2.f slasq3.f slasq4.f slasq5.f slasq6.f - slasr.f slasrt.f slassq.f90 slasv2.f spttrf.f sstebz.f sstedc.f - ssteqr.f ssterf.f slaisnan.f sisnan.f - slartgp.f slartgs.f ../INSTALL/sroundup_lwork.f - ../INSTALL/second_${TIMER}.f) - -set(DZLAUX - la_constants.f90 - dbdsdc.f - dbdsvdx.f dstevx.f dstein.f - dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f - dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f - dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f - dlagts.f dlamrg.f dlanst.f - dlapy2.f dlapy3.f dlarnv.f - dlarra.f dlarrb.f dlarrc.f dlarrd.f dlarre.f dlarrf.f dlarrj.f - dlarrk.f dlarrr.f dlaneg.f - dlartg.f90 dlaruv.f dlas2.f dlascl.f - dlasd0.f dlasd1.f dlasd2.f dlasd3.f dlasd4.f dlasd5.f dlasd6.f - dlasd7.f dlasd8.f dlasda.f dlasdq.f dlasdt.f - dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f - dlasr.f dlasrt.f dlassq.f90 dlasv2.f dpttrf.f dstebz.f dstedc.f - dsteqr.f dsterf.f dlaisnan.f disnan.f - dlartgp.f dlartgs.f ../INSTALL/droundup_lwork.f - ../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f) - -set(SLASRC - sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f - sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f - sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f - sgehd2.f sgehrd.f sgelq2.f sgelqf.f - sgels.f sgelsd.f sgelss.f sgelsy.f sgeql2.f sgeqlf.f - sgeqp3.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f - sgesc2.f sgesdd.f sgesvd.f sgesvdx.f sgesvx.f sgetc2.f - sgetrf2.f sgetri.f - sggbak.f sggbal.f - sgges.f sgges3.f sggesx.f sggev.f sggev3.f sggevx.f - sggglm.f sgghrd.f sgghd3.f sgglse.f sggqrf.f - sggrqf.f sggsvd3.f sggsvp3.f sgtcon.f sgtrfs.f sgtsv.f - sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f - shsein.f shseqr.f slabrd.f slacon.f slacn2.f - slaqz0.f slaqz1.f slaqz2.f slaqz3.f slaqz4.f - slaein.f slaexc.f slag2.f slags2.f slagtm.f slagv2.f slahqr.f - slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f - slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f - slansy.f slantb.f slantp.f slantr.f slanv2.f - slapll.f slapmt.f - slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f - slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f - slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f - slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f - slarrv.f slartv.f - slarz.f slarzb.f slarzt.f slasy2.f - slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f - slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f - sopgtr.f sopmtr.f sorg2l.f sorg2r.f - sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f - sorgrq.f sorgtr.f sorm2l.f sorm2r.f sorm22.f - sormbr.f sormhr.f sorml2.f sormlq.f sormql.f sormqr.f 
sormr2.f - sormr3.f sormrq.f sormrz.f sormtr.f spbcon.f spbequ.f spbrfs.f - spbstf.f spbsv.f spbsvx.f - spbtf2.f spbtrf.f spbtrs.f spocon.f spoequ.f sporfs.f sposv.f - sposvx.f spotrf2.f spotri.f spstrf.f spstf2.f - sppcon.f sppequ.f - spprfs.f sppsv.f sppsvx.f spptrf.f spptri.f spptrs.f sptcon.f - spteqr.f sptrfs.f sptsv.f sptsvx.f spttrs.f sptts2.f srscl.f - ssbev.f ssbevd.f ssbevx.f ssbgst.f ssbgv.f ssbgvd.f ssbgvx.f - ssbtrd.f sspcon.f sspev.f sspevd.f sspevx.f sspgst.f - sspgv.f sspgvd.f sspgvx.f ssprfs.f sspsv.f sspsvx.f ssptrd.f - ssptrf.f ssptri.f ssptrs.f sstegr.f sstev.f sstevd.f sstevr.f - ssycon.f ssyev.f ssyevd.f ssyevr.f ssyevx.f ssygs2.f - ssygst.f ssygv.f ssygvd.f ssygvx.f ssyrfs.f ssysv.f ssysvx.f - ssytd2.f ssytf2.f ssytrd.f ssytrf.f ssytri.f ssytri2.f ssytri2x.f - ssyswapr.f ssytrs.f ssytrs2.f - ssyconv.f ssyconvf.f ssyconvf_rook.f - ssysv_aa.f ssysv_aa_2stage.f ssytrf_aa.f ssytrf_aa_2stage.f ssytrs_aa.f ssytrs_aa_2stage.f - ssytf2_rook.f ssytrf_rook.f ssytrs_rook.f - ssytri_rook.f ssycon_rook.f ssysv_rook.f - ssytf2_rk.f ssytrf_rk.f ssytrs_3.f - ssytri_3.f ssytri_3x.f ssycon_3.f ssysv_rk.f - ssysv_aa.f ssytrf_aa.f ssytrs_aa.f - stbcon.f - stbrfs.f stbtrs.f stgevc.f stgex2.f stgexc.f stgsen.f - stgsja.f stgsna.f stgsy2.f stgsyl.f stpcon.f stprfs.f stptri.f - stptrs.f - strcon.f strevc.f strevc3.f strexc.f strrfs.f strsen.f strsna.f strsyl.f - strtrs.f stzrzf.f sstemr.f - slansf.f spftrf.f spftri.f spftrs.f ssfrk.f stfsm.f stftri.f stfttp.f - stfttr.f stpttf.f stpttr.f strttf.f strttp.f - sgejsv.f sgesvj.f sgsvj0.f sgsvj1.f - sgeequb.f ssyequb.f spoequb.f sgbequb.f - sbbcsd.f slapmr.f sorbdb.f sorbdb1.f sorbdb2.f sorbdb3.f sorbdb4.f - sorbdb5.f sorbdb6.f sorcsd.f sorcsd2by1.f - sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f - stpqrt.f stpqrt2.f stpmqrt.f stprfb.f - sgelqt.f sgelqt3.f sgemlqt.f - sgetsls.f sgetsqrhrt.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f - sgelq.f slaswlq.f slamswlq.f sgemlq.f - stplqt.f stplqt2.f stpmlqt.f - ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f - ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f - ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f - sgesvdq.f slaorhr_col_getrfnp.f - slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f - slarmm.f slatrs3.f strsyl3.f) - -set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f - sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f - sla_syrfsx_extended.f sla_syamv.f sla_syrcond.f sla_syrpvgrw.f - sposvxx.f sporfsx.f sla_porfsx_extended.f sla_porcond.f - sla_porpvgrw.f sgbsvxx.f sgbrfsx.f sla_gbrfsx_extended.f - sla_gbamv.f sla_gbrcond.f sla_gbrpvgrw.f sla_lin_berr.f slarscl2.f - slascl2.f sla_wwaddw.f) - -set(CLASRC - cbdsqr.f cgbbrd.f cgbcon.f cgbequ.f cgbrfs.f cgbsv.f cgbsvx.f - cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f - cgecon.f cgeequ.f cgees.f cgeesx.f cgeev.f cgeevx.f - cgehd2.f cgehrd.f cgelq2.f cgelqf.f - cgels.f cgelsd.f cgelss.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f - cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f cgerq2.f cgerqf.f - cgesc2.f cgesdd.f cgesvd.f cgesvdx.f - cgesvj.f cgejsv.f cgsvj0.f cgsvj1.f - cgesvx.f cgetc2.f cgetrf2.f - cgetri.f - cggbak.f cggbal.f - cgges.f cgges3.f cggesx.f cggev.f cggev3.f cggevx.f - cggglm.f cgghrd.f cgghd3.f cgglse.f cggqrf.f cggrqf.f - cggsvd3.f cggsvp3.f - cgtcon.f cgtrfs.f cgtsv.f cgtsvx.f cgttrf.f cgttrs.f cgtts2.f chbev.f - chbevd.f chbevx.f chbgst.f chbgv.f chbgvd.f chbgvx.f chbtrd.f - checon.f cheev.f cheevd.f cheevr.f cheevx.f chegs2.f chegst.f - chegv.f chegvd.f chegvx.f 
cherfs.f chesv.f chesvx.f chetd2.f - chetf2.f chetrd.f - chetrf.f chetri.f chetri2.f chetri2x.f cheswapr.f - chetrs.f chetrs2.f - chetf2_rook.f chetrf_rook.f chetri_rook.f - chetrs_rook.f checon_rook.f chesv_rook.f - chetf2_rk.f chetrf_rk.f chetri_3.f chetri_3x.f - chetrs_3.f checon_3.f chesv_rk.f - chesv_aa.f chesv_aa_2stage.f chetrf_aa.f chetrf_aa_2stage.f chetrs_aa.f chetrs_aa_2stage.f - chgeqz.f chpcon.f chpev.f chpevd.f - chpevx.f chpgst.f chpgv.f chpgvd.f chpgvx.f chprfs.f chpsv.f - chpsvx.f - chptrd.f chptrf.f chptri.f chptrs.f chsein.f chseqr.f clabrd.f - clacgv.f clacon.f clacn2.f clacp2.f clacpy.f clacrm.f clacrt.f cladiv.f - claed0.f claed7.f claed8.f - claein.f claesy.f claev2.f clags2.f clagtm.f - clahef.f clahef_rook.f clahef_rk.f clahef_aa.f clahqr.f - clahr2.f claic1.f clals0.f clalsa.f clalsd.f clangb.f clange.f clangt.f - clanhb.f clanhe.f - clanhp.f clanhs.f clanht.f clansb.f clansp.f clansy.f clantb.f - clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f - claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f - claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f - claqz0.f claqz1.f claqz2.f claqz3.f - claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f - clarf.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f - clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f90 clartv.f - clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f90 - clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f - clatbs.f clatdf.f clatps.f clatrd.f clatrs.f clatrz.f - cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f - cpbsvx.f cpbtf2.f cpbtrf.f cpbtrs.f cpocon.f cpoequ.f cporfs.f - cposv.f cposvx.f cpotrf2.f cpotri.f cpstrf.f cpstf2.f - cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f - cptcon.f cpteqr.f cptrfs.f cptsv.f cptsvx.f cpttrf.f cpttrs.f cptts2.f - crot.f cspcon.f csprfs.f cspsv.f - cspsvx.f csptrf.f csptri.f csptrs.f csrscl.f cstedc.f - cstegr.f cstein.f csteqr.f csycon.f - csyrfs.f csysv.f csysvx.f csytf2.f csytrf.f csytri.f - csytri2.f csytri2x.f csyswapr.f - csytrs.f csytrs2.f - csyconv.f csyconvf.f csyconvf_rook.f - csytf2_rook.f csytrf_rook.f csytrs_rook.f - csytri_rook.f csycon_rook.f csysv_rook.f - csytf2_rk.f csytrf_rk.f csytrf_aa.f csytrf_aa_2stage.f csytrs_3.f csytrs_aa.f csytrs_aa_2stage.f - csytri_3.f csytri_3x.f csycon_3.f csysv_rk.f csysv_aa.f csysv_aa_2stage.f - ctbcon.f ctbrfs.f ctbtrs.f ctgevc.f ctgex2.f - ctgexc.f ctgsen.f ctgsja.f ctgsna.f ctgsy2.f ctgsyl.f ctpcon.f - ctprfs.f ctptri.f - ctptrs.f ctrcon.f ctrevc.f ctrevc3.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f - ctrsyl.f ctrtrs.f ctzrzf.f cung2l.f cung2r.f - cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f - cungrq.f cungtr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f cunm22.f - cunmlq.f cunmql.f cunmqr.f cunmr2.f cunmr3.f cunmrq.f cunmrz.f - cunmtr.f cupgtr.f cupmtr.f icmax1.f scsum1.f cstemr.f - chfrk.f ctfttp.f clanhf.f cpftrf.f cpftri.f cpftrs.f ctfsm.f ctftri.f - ctfttr.f ctpttf.f ctpttr.f ctrttf.f ctrttp.f - cgeequb.f cgbequb.f csyequb.f cpoequb.f cheequb.f - cbbcsd.f clapmr.f cunbdb.f cunbdb1.f cunbdb2.f cunbdb3.f cunbdb4.f - cunbdb5.f cunbdb6.f cuncsd.f cuncsd2by1.f - cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f - ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f - cgelqt.f cgelqt3.f cgemlqt.f - cgetsls.f cgetsqrhrt.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f - cgelq.f claswlq.f clamswlq.f cgemlq.f - ctplqt.f ctplqt2.f ctpmlqt.f - chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f - cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f - chbev_2stage.f 
chbevx_2stage.f chbevd_2stage.f chegv_2stage.f - cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f - cungtsqr.f cungtsqr_row.f cunhr_col.f - clatrs3.f ctrsyl3.f ) - -set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f - cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f - csysvxx.f csyrfsx.f cla_syrfsx_extended.f cla_syamv.f - cla_syrcond_c.f cla_syrcond_x.f cla_syrpvgrw.f - cposvxx.f cporfsx.f cla_porfsx_extended.f - cla_porcond_c.f cla_porcond_x.f cla_porpvgrw.f - cgbsvxx.f cgbrfsx.f cla_gbrfsx_extended.f cla_gbamv.f - cla_gbrcond_c.f cla_gbrcond_x.f cla_gbrpvgrw.f - chesvxx.f cherfsx.f cla_herfsx_extended.f cla_heamv.f - cla_hercond_c.f cla_hercond_x.f cla_herpvgrw.f - cla_lin_berr.f clarscl2.f clascl2.f cla_wwaddw.f) - -set(DLASRC - dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f - dgbsvx.f dgbtf2.f dgbtrf.f dgbtrs.f dgebak.f dgebal.f dgebd2.f - dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f - dgehd2.f dgehrd.f dgelq2.f dgelqf.f - dgels.f dgelsd.f dgelss.f dgelsy.f dgeql2.f dgeqlf.f - dgeqp3.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f - dgesc2.f dgesdd.f dgesvd.f dgesvdx.f dgesvx.f dgetc2.f - dgetrf2.f dgetri.f - dggbak.f dggbal.f - dgges.f dgges3.f dggesx.f dggev.f dggev3.f dggevx.f - dggglm.f dgghrd.f dgghd3.f dgglse.f dggqrf.f - dggrqf.f dggsvd3.f dggsvp3.f dgtcon.f dgtrfs.f dgtsv.f - dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f - dlaqz0.f dlaqz1.f dlaqz2.f dlaqz3.f dlaqz4.f - dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f - dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f - dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f - dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f - dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f - dlapll.f dlapmt.f - dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f - dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f - dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f - dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f - dlargv.f dlarrv.f dlartv.f - dlarz.f dlarzb.f dlarzt.f dlasy2.f - dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f - dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f - dopgtr.f dopmtr.f dorg2l.f dorg2r.f - dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f - dorgrq.f dorgtr.f dorm2l.f dorm2r.f dorm22.f - dormbr.f dormhr.f dorml2.f dormlq.f dormql.f dormqr.f dormr2.f - dormr3.f dormrq.f dormrz.f dormtr.f dpbcon.f dpbequ.f dpbrfs.f - dpbstf.f dpbsv.f dpbsvx.f - dpbtf2.f dpbtrf.f dpbtrs.f dpocon.f dpoequ.f dporfs.f dposv.f - dposvx.f dpotrf2.f dpotri.f dpotrs.f dpstrf.f dpstf2.f - dppcon.f dppequ.f - dpprfs.f dppsv.f dppsvx.f dpptrf.f dpptri.f dpptrs.f dptcon.f - dpteqr.f dptrfs.f dptsv.f dptsvx.f dpttrs.f dptts2.f drscl.f - dsbev.f dsbevd.f dsbevx.f dsbgst.f dsbgv.f dsbgvd.f dsbgvx.f - dsbtrd.f dspcon.f dspev.f dspevd.f dspevx.f dspgst.f - dspgv.f dspgvd.f dspgvx.f dsprfs.f dspsv.f dspsvx.f dsptrd.f - dsptrf.f dsptri.f dsptrs.f dstegr.f dstev.f dstevd.f dstevr.f - dsycon.f dsyev.f dsyevd.f dsyevr.f - dsyevx.f dsygs2.f dsygst.f dsygv.f dsygvd.f dsygvx.f dsyrfs.f - dsysv.f dsysvx.f - dsytd2.f dsytf2.f dsytrd.f dsytrf.f dsytri.f dsytrs.f dsytrs2.f - dsytri2.f dsytri2x.f dsyswapr.f - dsyconv.f dsyconvf.f dsyconvf_rook.f - dsytf2_rook.f dsytrf_rook.f dsytrs_rook.f - dsytri_rook.f dsycon_rook.f dsysv_rook.f - dsytf2_rk.f dsytrf_rk.f dsytrs_3.f - dsytri_3.f dsytri_3x.f dsycon_3.f dsysv_rk.f - dsysv_aa.f dsysv_aa_2stage.f dsytrf_aa.f dsytrf_aa_2stage.f dsytrs_aa.f dsytrs_aa_2stage.f - dtbcon.f - dtbrfs.f dtbtrs.f dtgevc.f dtgex2.f 
dtgexc.f dtgsen.f - dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f - dtptrs.f - dtrcon.f dtrevc.f dtrevc3.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f - dtrtrs.f dtzrzf.f dstemr.f - dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f - dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f dtfttp.f - dtfttr.f dtpttf.f dtpttr.f dtrttf.f dtrttp.f - dgejsv.f dgesvj.f dgsvj0.f dgsvj1.f - dgeequb.f dsyequb.f dpoequb.f dgbequb.f - dbbcsd.f dlapmr.f dorbdb.f dorbdb1.f dorbdb2.f dorbdb3.f dorbdb4.f - dorbdb5.f dorbdb6.f dorcsd.f dorcsd2by1.f - dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f - dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f - dgelqt.f dgelqt3.f dgemlqt.f - dgetsls.f dgetsqrhrt.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f - dgelq.f dlaswlq.f dlamswlq.f dgemlq.f - dtplqt.f dtplqt2.f dtpmlqt.f - dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f - dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f - dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f - dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f - dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f - dlarmm.f dlatrs3.f dtrsyl3.f) - -set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f - dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f - dla_syrfsx_extended.f dla_syamv.f dla_syrcond.f dla_syrpvgrw.f - dposvxx.f dporfsx.f dla_porfsx_extended.f dla_porcond.f - dla_porpvgrw.f dgbsvxx.f dgbrfsx.f dla_gbrfsx_extended.f - dla_gbamv.f dla_gbrcond.f dla_gbrpvgrw.f dla_lin_berr.f dlarscl2.f - dlascl2.f dla_wwaddw.f) - -set(ZLASRC - zbdsqr.f zgbbrd.f zgbcon.f zgbequ.f zgbrfs.f zgbsv.f zgbsvx.f - zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f - zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f - zgehd2.f zgehrd.f zgelq2.f zgelqf.f - zgels.f zgelsd.f zgelss.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f - zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f - zgesc2.f zgesdd.f zgesvd.f zgesvdx.f zgesvx.f - zgesvj.f zgejsv.f zgsvj0.f zgsvj1.f - zgetc2.f zgetrf2.f - zgetri.f - zggbak.f zggbal.f - zgges.f zgges3.f zggesx.f zggev.f zggev3.f zggevx.f - zggglm.f zgghrd.f zgghd3.f zgglse.f zggqrf.f zggrqf.f - zggsvd3.f zggsvp3.f - zgtcon.f zgtrfs.f zgtsv.f zgtsvx.f zgttrf.f zgttrs.f zgtts2.f zhbev.f - zhbevd.f zhbevx.f zhbgst.f zhbgv.f zhbgvd.f zhbgvx.f zhbtrd.f - zhecon.f zheev.f zheevd.f zheevr.f zheevx.f zhegs2.f zhegst.f - zhegv.f zhegvd.f zhegvx.f zherfs.f zhesv.f zhesvx.f zhetd2.f - zhetf2.f zhetrd.f - zhetrf.f zhetri.f zhetri2.f zhetri2x.f zheswapr.f - zhetrs.f zhetrs2.f - zhetf2_rook.f zhetrf_rook.f zhetri_rook.f - zhetrs_rook.f zhecon_rook.f zhesv_rook.f - zhetf2_rk.f zhetrf_rk.f zhetri_3.f zhetri_3x.f - zhetrs_3.f zhecon_3.f zhesv_rk.f - zhesv_aa.f zhesv_aa_2stage.f zhetrf_aa.f zhetrf_aa_2stage.f zhetrs_aa.f zhetrs_aa_2stage.f - zhgeqz.f zhpcon.f zhpev.f zhpevd.f - zlaqz0.f zlaqz1.f zlaqz2.f zlaqz3.f - zhpevx.f zhpgst.f zhpgv.f zhpgvd.f zhpgvx.f zhprfs.f zhpsv.f - zhpsvx.f - zhptrd.f zhptrf.f zhptri.f zhptrs.f zhsein.f zhseqr.f zlabrd.f - zlacgv.f zlacon.f zlacn2.f zlacp2.f zlacpy.f zlacrm.f zlacrt.f zladiv.f - zlaed0.f zlaed7.f zlaed8.f - zlaein.f zlaesy.f zlaev2.f zlags2.f zlagtm.f - zlahef.f zlahef_rook.f zlahef_rk.f zlahef_aa.f zlahqr.f - zlahr2.f zlaic1.f zlals0.f zlalsa.f zlalsd.f zlangb.f zlange.f - zlangt.f zlanhb.f - zlanhe.f - zlanhp.f zlanhs.f zlanht.f zlansb.f zlansp.f zlansy.f zlantb.f - zlantp.f zlantr.f zlapll.f zlapmt.f zlaqgb.f zlaqge.f - zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f - zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f - zlaqsp.f 
zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f - zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f - zlarfg.f zlarfgp.f zlarft.f - zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f90 zlartv.f - zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f - zlassq.f90 zlasyf.f zlasyf_rook.f zlasyf_rk.f zlasyf_aa.f - zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f - zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f - zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f - zposv.f zposvx.f zpotrf2.f zpotri.f zpotrs.f zpstrf.f zpstf2.f - zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f - zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f - zrot.f zspcon.f zsprfs.f zspsv.f - zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f - zstegr.f zstein.f zsteqr.f zsycon.f - zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f - zsytri2.f zsytri2x.f zsyswapr.f - zsytrs.f zsytrs2.f - zsyconv.f zsyconvf.f zsyconvf_rook.f - zsytf2_rook.f zsytrf_rook.f zsytrs_rook.f zsytrs_aa.f zsytrs_aa_2stage.f - zsytri_rook.f zsycon_rook.f zsysv_rook.f - zsytf2_rk.f zsytrf_rk.f zsytrf_aa.f zsytrf_aa_2stage.f zsytrs_3.f - zsytri_3.f zsytri_3x.f zsycon_3.f zsysv_rk.f zsysv_aa.f zsysv_aa_2stage.f - ztbcon.f ztbrfs.f ztbtrs.f ztgevc.f ztgex2.f - ztgexc.f ztgsen.f ztgsja.f ztgsna.f ztgsy2.f ztgsyl.f ztpcon.f - ztprfs.f ztptri.f - ztptrs.f ztrcon.f ztrevc.f ztrevc3.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f - ztrsyl.f ztrtrs.f ztzrzf.f zung2l.f - zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f - zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f zunm22.f - zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f - zunmtr.f zupgtr.f - zupmtr.f izmax1.f dzsum1.f zstemr.f - zcgesv.f zcposv.f zlag2c.f clag2z.f zlat2c.f - zhfrk.f ztfttp.f zlanhf.f zpftrf.f zpftri.f zpftrs.f ztfsm.f ztftri.f - ztfttr.f ztpttf.f ztpttr.f ztrttf.f ztrttp.f - zgeequb.f zgbequb.f zsyequb.f zpoequb.f zheequb.f - zbbcsd.f zlapmr.f zunbdb.f zunbdb1.f zunbdb2.f zunbdb3.f zunbdb4.f - zunbdb5.f zunbdb6.f zuncsd.f zuncsd2by1.f - zgeqrt.f zgeqrt2.f zgeqrt3.f zgemqrt.f - ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f - ztplqt.f ztplqt2.f ztpmlqt.f - zgelqt.f zgelqt3.f zgemlqt.f - zgetsls.f zgetsqrhrt.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f - zgelq.f zlaswlq.f zlamswlq.f zgemlq.f - zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f - zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f - zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f - zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f - zungtsqr.f zungtsqr_row.f zunhr_col.f - zlatrs3.f ztrsyl3.f) - -set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f - zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f - zla_syrfsx_extended.f zla_syamv.f zla_syrcond_c.f zla_syrcond_x.f - zla_syrpvgrw.f zposvxx.f zporfsx.f zla_porfsx_extended.f - zla_porcond_c.f zla_porcond_x.f zla_porpvgrw.f zgbsvxx.f zgbrfsx.f - zla_gbrfsx_extended.f zla_gbamv.f zla_gbrcond_c.f zla_gbrcond_x.f - zla_gbrpvgrw.f zhesvxx.f zherfsx.f zla_herfsx_extended.f - zla_heamv.f zla_hercond_c.f zla_hercond_x.f zla_herpvgrw.f - zla_lin_berr.f zlarscl2.f zlascl2.f zla_wwaddw.f) - - -if(USE_XBLAS) - set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC}) -endif() - -list(APPEND SLASRC DEPRECATED/sgegs.f DEPRECATED/sgegv.f - DEPRECATED/sgeqpf.f DEPRECATED/sgelsx.f DEPRECATED/sggsvd.f - DEPRECATED/sggsvp.f DEPRECATED/slahrd.f DEPRECATED/slatzm.f DEPRECATED/stzrqf.f) -list(APPEND DLASRC DEPRECATED/dgegs.f DEPRECATED/dgegv.f - 
DEPRECATED/dgeqpf.f DEPRECATED/dgelsx.f DEPRECATED/dggsvd.f - DEPRECATED/dggsvp.f DEPRECATED/dlahrd.f DEPRECATED/dlatzm.f DEPRECATED/dtzrqf.f) -list(APPEND CLASRC DEPRECATED/cgegs.f DEPRECATED/cgegv.f - DEPRECATED/cgeqpf.f DEPRECATED/cgelsx.f DEPRECATED/cggsvd.f - DEPRECATED/cggsvp.f DEPRECATED/clahrd.f DEPRECATED/clatzm.f DEPRECATED/ctzrqf.f) -list(APPEND ZLASRC DEPRECATED/zgegs.f DEPRECATED/zgegv.f - DEPRECATED/zgeqpf.f DEPRECATED/zgelsx.f DEPRECATED/zggsvd.f - DEPRECATED/zggsvp.f DEPRECATED/zlahrd.f DEPRECATED/zlatzm.f DEPRECATED/ztzrqf.f) -message(STATUS "Building deprecated routines") - -set(DSLASRC spotrs.f) - -set(ZCLASRC cpotrs.f) - -set(SCATGEN slatm1.f slaran.f slarnd.f) - -set(SMATGEN slatms.f slatme.f slatmr.f slatmt.f - slagge.f slagsy.f slakf2.f slarge.f slaror.f slarot.f slatm2.f - slatm3.f slatm5.f slatm6.f slatm7.f slahilb.f) - -set(CMATGEN clatms.f clatme.f clatmr.f clatmt.f - clagge.f claghe.f clagsy.f clakf2.f clarge.f claror.f clarot.f - clatm1.f clarnd.f clatm2.f clatm3.f clatm5.f clatm6.f clahilb.f slatm7.f) - -set(DZATGEN dlatm1.f dlaran.f dlarnd.f) - -set(DMATGEN dlatms.f dlatme.f dlatmr.f dlatmt.f - dlagge.f dlagsy.f dlakf2.f dlarge.f dlaror.f dlarot.f dlatm2.f - dlatm3.f dlatm5.f dlatm6.f dlatm7.f dlahilb.f) - -set(ZMATGEN zlatms.f zlatme.f zlatmr.f zlatmt.f - zlagge.f zlaghe.f zlagsy.f zlakf2.f zlarge.f zlaror.f zlarot.f - zlatm1.f zlarnd.f zlatm2.f zlatm3.f zlatm5.f zlatm6.f zlahilb.f dlatm7.f) - -if(BUILD_SINGLE) - set(LA_REL_SRC ${SLASRC} ${DSLASRC} ${ALLAUX} ${SCLAUX}) - set(LA_GEN_SRC ${SMATGEN} ${SCATGEN}) - message(STATUS "Building Single Precision") -endif() -if(BUILD_DOUBLE) - set(LA_REL_SRC ${LA_REL_SRC} ${DLASRC} ${DSLASRC} ${ALLAUX} ${DZLAUX}) - set(LA_GEN_SRC ${LA_GEN_SRC} ${DMATGEN} ${DZATGEN}) - message(STATUS "Building Double Precision") -endif() -if(BUILD_COMPLEX) - set(LA_REL_SRC ${LA_REL_SRC} ${CLASRC} ${ZCLASRC} ${ALLAUX} ${SCLAUX}) - SET(LA_GEN_SRC ${LA_GEN_SRC} ${CMATGEN} ${SCATGEN}) - message(STATUS "Building Single Precision Complex") -endif() -if(BUILD_COMPLEX16) - set(LA_REL_SRC ${LA_REL_SRC} ${ZLASRC} ${ZCLASRC} ${ALLAUX} ${DZLAUX}) - SET(LA_GEN_SRC ${LA_GEN_SRC} ${ZMATGEN} ${DZATGEN}) -# for zlange/zlanhe - if (NOT BUILD_DOUBLE) - set (LA_REL_SRC ${LA_REL_SRC} dcombssq.f) - endif () - message(STATUS "Building Double Precision Complex") -endif() - -else () - - message (STATUS "c lapack") -set(ALLAUX ilaenv.c ilaenv2stage.c ieeeck.c lsamen.c iparmq.c iparam2stage.c - ilaprec.c ilatrans.c ilauplo.c iladiag.c chla_transtype.c dlaset.c - ../INSTALL/ilaver.c xerbla_array.c - ../INSTALL/slamch.c) - -set(SCLAUX - scombssq.c sbdsvdx.c sstevx.c sstein.c - sbdsdc.c - sbdsqr.c sdisna.c slabad.c slacpy.c sladiv.c slae2.c slaebz.c - slaed0.c slaed1.c slaed2.c slaed3.c slaed4.c slaed5.c slaed6.c - slaed7.c slaed8.c slaed9.c slaeda.c slaev2.c slagtf.c - slagts.c slamrg.c slanst.c - slapy2.c slapy3.c slarnv.c - slarra.c slarrb.c slarrc.c slarrd.c slarre.c slarrf.c slarrj.c - slarrk.c slarrr.c slaneg.c - slartg.c slaruv.c slas2.c slascl.c - slasd0.c slasd1.c slasd2.c slasd3.c slasd4.c slasd5.c slasd6.c - slasd7.c slasd8.c slasda.c slasdq.c slasdt.c - slaset.c slasq1.c slasq2.c slasq3.c slasq4.c slasq5.c slasq6.c - slasr.c slasrt.c slassq.c slasv2.c spttrf.c sstebz.c sstedc.c - ssteqr.c ssterf.c slaisnan.c sisnan.c - slartgp.c slartgs.c - ../INSTALL/second_${TIMER}.c) - -set(DZLAUX - dbdsdc.c - dbdsvdx.c dstevx.c dstein.c - dbdsqr.c ddisna.c dlabad.c dlacpy.c dladiv.c dlae2.c dlaebz.c - dlaed0.c dlaed1.c dlaed2.c dlaed3.c dlaed4.c dlaed5.c dlaed6.c - 
dlaed7.c dlaed8.c dlaed9.c dlaeda.c dlaev2.c dlagtf.c - dlagts.c dlamrg.c dlanst.c - dlapy2.c dlapy3.c dlarnv.c - dlarra.c dlarrb.c dlarrc.c dlarrd.c dlarre.c dlarrf.c dlarrj.c - dlarrk.c dlarrr.c dlaneg.c - dlartg.c dlaruv.c dlas2.c dlascl.c - dlasd0.c dlasd1.c dlasd2.c dlasd3.c dlasd4.c dlasd5.c dlasd6.c - dlasd7.c dlasd8.c dlasda.c dlasdq.c dlasdt.c - dlasq1.c dlasq2.c dlasq3.c dlasq4.c dlasq5.c dlasq6.c - dlasr.c dlasrt.c dlassq.c dlasv2.c dpttrf.c dstebz.c dstedc.c - dsteqr.c dsterf.c dlaisnan.c disnan.c - dlartgp.c dlartgs.c - ../INSTALL/dlamch.c ../INSTALL/dsecnd_${TIMER}.c) - -set(SLASRC - sgbbrd.c sgbcon.c sgbequ.c sgbrfs.c sgbsv.c - sgbsvx.c sgbtf2.c sgbtrf.c sgbtrs.c sgebak.c sgebal.c sgebd2.c - sgebrd.c sgecon.c sgeequ.c sgees.c sgeesx.c sgeev.c sgeevx.c - sgehd2.c sgehrd.c sgelq2.c sgelqf.c - sgels.c sgelsd.c sgelss.c sgelsy.c sgeql2.c sgeqlf.c - sgeqp3.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c - sgesc2.c sgesdd.c sgesvd.c sgesvdx.c sgesvx.c sgetc2.c - sgetrf2.c sgetri.c - sggbak.c sggbal.c - sgges.c sgges3.c sggesx.c sggev.c sggev3.c sggevx.c - sggglm.c sgghrd.c sgghd3.c sgglse.c sggqrf.c - sggrqf.c sggsvd3.c sggsvp3.c sgtcon.c sgtrfs.c sgtsv.c - sgtsvx.c sgttrf.c sgttrs.c sgtts2.c shgeqz.c - shsein.c shseqr.c slabrd.c slacon.c slacn2.c - slaein.c slaexc.c slag2.c slags2.c slagtm.c slagv2.c slahqr.c - slahr2.c slaic1.c slaln2.c slals0.c slalsa.c slalsd.c - slangb.c slange.c slangt.c slanhs.c slansb.c slansp.c - slansy.c slantb.c slantp.c slantr.c slanv2.c - slapll.c slapmt.c - slaqgb.c slaqge.c slaqp2.c slaqps.c slaqsb.c slaqsp.c slaqsy.c - slaqr0.c slaqr1.c slaqr2.c slaqr3.c slaqr4.c slaqr5.c - slaqtr.c slar1v.c slar2v.c ilaslr.c ilaslc.c - slarf.c slarfb.c slarfb_gett.c slarfg.c slarfgp.c slarft.c slarfx.c slarfy.c slargv.c - slarrv.c slartv.c - slarz.c slarzb.c slarzt.c slasy2.c - slasyf.c slasyf_rook.c slasyf_rk.c slasyf_aa.c - slatbs.c slatdf.c slatps.c slatrd.c slatrs.c slatrz.c - sopgtr.c sopmtr.c sorg2l.c sorg2r.c - sorgbr.c sorghr.c sorgl2.c sorglq.c sorgql.c sorgqr.c sorgr2.c - sorgrq.c sorgtr.c sorm2l.c sorm2r.c sorm22.c - sormbr.c sormhr.c sorml2.c sormlq.c sormql.c sormqr.c sormr2.c - sormr3.c sormrq.c sormrz.c sormtr.c spbcon.c spbequ.c spbrfs.c - spbstf.c spbsv.c spbsvx.c - spbtf2.c spbtrf.c spbtrs.c spocon.c spoequ.c sporfs.c sposv.c - sposvx.c spotrf2.c spotri.c spstrf.c spstf2.c - sppcon.c sppequ.c - spprfs.c sppsv.c sppsvx.c spptrf.c spptri.c spptrs.c sptcon.c - spteqr.c sptrfs.c sptsv.c sptsvx.c spttrs.c sptts2.c srscl.c - ssbev.c ssbevd.c ssbevx.c ssbgst.c ssbgv.c ssbgvd.c ssbgvx.c - ssbtrd.c sspcon.c sspev.c sspevd.c sspevx.c sspgst.c - sspgv.c sspgvd.c sspgvx.c ssprfs.c sspsv.c sspsvx.c ssptrd.c - ssptrf.c ssptri.c ssptrs.c sstegr.c sstev.c sstevd.c sstevr.c - ssycon.c ssyev.c ssyevd.c ssyevr.c ssyevx.c ssygs2.c - ssygst.c ssygv.c ssygvd.c ssygvx.c ssyrfs.c ssysv.c ssysvx.c - ssytd2.c ssytf2.c ssytrd.c ssytrf.c ssytri.c ssytri2.c ssytri2x.c - ssyswapr.c ssytrs.c ssytrs2.c - ssyconv.c ssyconvf.c ssyconvf_rook.c - ssysv_aa.c ssysv_aa_2stage.c ssytrf_aa.c ssytrf_aa_2stage.c ssytrs_aa.c ssytrs_aa_2stage.c - ssytf2_rook.c ssytrf_rook.c ssytrs_rook.c - ssytri_rook.c ssycon_rook.c ssysv_rook.c - ssytf2_rk.c ssytrf_rk.c ssytrs_3.c - ssytri_3.c ssytri_3x.c ssycon_3.c ssysv_rk.c - ssysv_aa.c ssytrf_aa.c ssytrs_aa.c - stbcon.c - stbrfs.c stbtrs.c stgevc.c stgex2.c stgexc.c stgsen.c - stgsja.c stgsna.c stgsy2.c stgsyl.c stpcon.c stprfs.c stptri.c - stptrs.c - strcon.c strevc.c strevc3.c strexc.c strrfs.c strsen.c strsna.c strsyl.c - 
strtrs.c stzrzf.c sstemr.c - slansf.c spftrf.c spftri.c spftrs.c ssfrk.c stfsm.c stftri.c stfttp.c - stfttr.c stpttf.c stpttr.c strttf.c strttp.c - sgejsv.c sgesvj.c sgsvj0.c sgsvj1.c - sgeequb.c ssyequb.c spoequb.c sgbequb.c - sbbcsd.c slapmr.c sorbdb.c sorbdb1.c sorbdb2.c sorbdb3.c sorbdb4.c - sorbdb5.c sorbdb6.c sorcsd.c sorcsd2by1.c - sgeqrt.c sgeqrt2.c sgeqrt3.c sgemqrt.c - stpqrt.c stpqrt2.c stpmqrt.c stprfb.c - sgelqt.c sgelqt3.c sgemlqt.c - sgetsls.c sgetsqrhrt.c sgeqr.c slatsqr.c slamtsqr.c sgemqr.c - sgelq.c slaswlq.c slamswlq.c sgemlq.c - stplqt.c stplqt2.c stpmlqt.c - ssytrd_2stage.c ssytrd_sy2sb.c ssytrd_sb2st.c ssb2st_kernels.c - ssyevd_2stage.c ssyev_2stage.c ssyevx_2stage.c ssyevr_2stage.c - ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c - sgesvdq.c slaorhr_col_getrfnp.c - slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c - slarmm.c slatrs3.c strsyl3.c) - -set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c - sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c - sla_syrfsx_extended.c sla_syamv.c sla_syrcond.c sla_syrpvgrw.c - sposvxx.c sporfsx.c sla_porfsx_extended.c sla_porcond.c - sla_porpvgrw.c sgbsvxx.c sgbrfsx.c sla_gbrfsx_extended.c - sla_gbamv.c sla_gbrcond.c sla_gbrpvgrw.c sla_lin_berr.c slarscl2.c - slascl2.c sla_wwaddw.c) - -set(CLASRC - cbdsqr.c cgbbrd.c cgbcon.c cgbequ.c cgbrfs.c cgbsv.c cgbsvx.c - cgbtf2.c cgbtrf.c cgbtrs.c cgebak.c cgebal.c cgebd2.c cgebrd.c - cgecon.c cgeequ.c cgees.c cgeesx.c cgeev.c cgeevx.c - cgehd2.c cgehrd.c cgelq2.c cgelqf.c - cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c - cgeqr2.c cgeqr2p.c cgeqrf.c cgeqrfp.c cgerfs.c cgerq2.c cgerqf.c - cgesc2.c cgesdd.c cgesvd.c cgesvdx.c - cgesvj.c cgejsv.c cgsvj0.c cgsvj1.c - cgesvx.c cgetc2.c cgetrf2.c - cgetri.c - cggbak.c cggbal.c - cgges.c cgges3.c cggesx.c cggev.c cggev3.c cggevx.c - cggglm.c cgghrd.c cgghd3.c cgglse.c cggqrf.c cggrqf.c - cggsvd3.c cggsvp3.c - cgtcon.c cgtrfs.c cgtsv.c cgtsvx.c cgttrf.c cgttrs.c cgtts2.c chbev.c - chbevd.c chbevx.c chbgst.c chbgv.c chbgvd.c chbgvx.c chbtrd.c - checon.c cheev.c cheevd.c cheevr.c cheevx.c chegs2.c chegst.c - chegv.c chegvd.c chegvx.c cherfs.c chesv.c chesvx.c chetd2.c - chetf2.c chetrd.c - chetrf.c chetri.c chetri2.c chetri2x.c cheswapr.c - chetrs.c chetrs2.c - chetf2_rook.c chetrf_rook.c chetri_rook.c - chetrs_rook.c checon_rook.c chesv_rook.c - chetf2_rk.c chetrf_rk.c chetri_3.c chetri_3x.c - chetrs_3.c checon_3.c chesv_rk.c - chesv_aa.c chesv_aa_2stage.c chetrf_aa.c chetrf_aa_2stage.c chetrs_aa.c chetrs_aa_2stage.c - chgeqz.c chpcon.c chpev.c chpevd.c - chpevx.c chpgst.c chpgv.c chpgvd.c chpgvx.c chprfs.c chpsv.c - chpsvx.c - chptrd.c chptrf.c chptri.c chptrs.c chsein.c chseqr.c clabrd.c - clacgv.c clacon.c clacn2.c clacp2.c clacpy.c clacrm.c clacrt.c cladiv.c - claed0.c claed7.c claed8.c - claein.c claesy.c claev2.c clags2.c clagtm.c - clahef.c clahef_rook.c clahef_rk.c clahef_aa.c clahqr.c - clahr2.c claic1.c clals0.c clalsa.c clalsd.c clangb.c clange.c clangt.c - clanhb.c clanhe.c - clanhp.c clanhs.c clanht.c clansb.c clansp.c clansy.c clantb.c - clantp.c clantr.c clapll.c clapmt.c clarcm.c claqgb.c claqge.c - claqhb.c claqhe.c claqhp.c claqp2.c claqps.c claqsb.c - claqr0.c claqr1.c claqr2.c claqr3.c claqr4.c claqr5.c - claqsp.c claqsy.c clar1v.c clar2v.c ilaclr.c ilaclc.c - clarf.c clarfb.c clarfb_gett.c clarfg.c clarfgp.c clarft.c - clarfx.c clarfy.c clargv.c clarnv.c clarrv.c clartg.c clartv.c - clarz.c clarzb.c clarzt.c clascl.c claset.c clasr.c classq.c - clasyf.c clasyf_rook.c 
clasyf_rk.c clasyf_aa.c - clatbs.c clatdf.c clatps.c clatrd.c clatrs.c clatrz.c - cpbcon.c cpbequ.c cpbrfs.c cpbstf.c cpbsv.c - cpbsvx.c cpbtf2.c cpbtrf.c cpbtrs.c cpocon.c cpoequ.c cporfs.c - cposv.c cposvx.c cpotrf2.c cpotri.c cpstrf.c cpstf2.c - cppcon.c cppequ.c cpprfs.c cppsv.c cppsvx.c cpptrf.c cpptri.c cpptrs.c - cptcon.c cpteqr.c cptrfs.c cptsv.c cptsvx.c cpttrf.c cpttrs.c cptts2.c - crot.c cspcon.c csprfs.c cspsv.c - cspsvx.c csptrf.c csptri.c csptrs.c csrscl.c cstedc.c - cstegr.c cstein.c csteqr.c csycon.c - csyrfs.c csysv.c csysvx.c csytf2.c csytrf.c csytri.c - csytri2.c csytri2x.c csyswapr.c - csytrs.c csytrs2.c - csyconv.c csyconvf.c csyconvf_rook.c - csytf2_rook.c csytrf_rook.c csytrs_rook.c - csytri_rook.c csycon_rook.c csysv_rook.c - csytf2_rk.c csytrf_rk.c csytrf_aa.c csytrf_aa_2stage.c csytrs_3.c csytrs_aa.c csytrs_aa_2stage.c - csytri_3.c csytri_3x.c csycon_3.c csysv_rk.c csysv_aa.c csysv_aa_2stage.c - ctbcon.c ctbrfs.c ctbtrs.c ctgevc.c ctgex2.c - ctgexc.c ctgsen.c ctgsja.c ctgsna.c ctgsy2.c ctgsyl.c ctpcon.c - ctprfs.c ctptri.c - ctptrs.c ctrcon.c ctrevc.c ctrevc3.c ctrexc.c ctrrfs.c ctrsen.c ctrsna.c - ctrsyl.c ctrtrs.c ctzrzf.c cung2l.c cung2r.c - cungbr.c cunghr.c cungl2.c cunglq.c cungql.c cungqr.c cungr2.c - cungrq.c cungtr.c cunm2l.c cunm2r.c cunmbr.c cunmhr.c cunml2.c cunm22.c - cunmlq.c cunmql.c cunmqr.c cunmr2.c cunmr3.c cunmrq.c cunmrz.c - cunmtr.c cupgtr.c cupmtr.c icmax1.c scsum1.c cstemr.c - chfrk.c ctfttp.c clanhf.c cpftrf.c cpftri.c cpftrs.c ctfsm.c ctftri.c - ctfttr.c ctpttf.c ctpttr.c ctrttf.c ctrttp.c - cgeequb.c cgbequb.c csyequb.c cpoequb.c cheequb.c - cbbcsd.c clapmr.c cunbdb.c cunbdb1.c cunbdb2.c cunbdb3.c cunbdb4.c - cunbdb5.c cunbdb6.c cuncsd.c cuncsd2by1.c - cgeqrt.c cgeqrt2.c cgeqrt3.c cgemqrt.c - ctpqrt.c ctpqrt2.c ctpmqrt.c ctprfb.c - cgelqt.c cgelqt3.c cgemlqt.c - cgetsls.c cgetsqrhrt.c cgeqr.c clatsqr.c clamtsqr.c cgemqr.c - cgelq.c claswlq.c clamswlq.c cgemlq.c - ctplqt.c ctplqt2.c ctpmlqt.c - chetrd_2stage.c chetrd_he2hb.c chetrd_hb2st.c chb2st_kernels.c - cheevd_2stage.c cheev_2stage.c cheevx_2stage.c cheevr_2stage.c - chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c - cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c - cungtsqr.c cungtsqr_row.c cunhr_col.c - clatrs3.c ctrsyl3.c) - -set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c - cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c - csysvxx.c csyrfsx.c cla_syrfsx_extended.c cla_syamv.c - cla_syrcond_c.c cla_syrcond_x.c cla_syrpvgrw.c - cposvxx.c cporfsx.c cla_porfsx_extended.c - cla_porcond_c.c cla_porcond_x.c cla_porpvgrw.c - cgbsvxx.c cgbrfsx.c cla_gbrfsx_extended.c cla_gbamv.c - cla_gbrcond_c.c cla_gbrcond_x.c cla_gbrpvgrw.c - chesvxx.c cherfsx.c cla_herfsx_extended.c cla_heamv.c - cla_hercond_c.c cla_hercond_x.c cla_herpvgrw.c - cla_lin_berr.c clarscl2.c clascl2.c cla_wwaddw.c) - -set(DLASRC - dgbbrd.c dgbcon.c dgbequ.c dgbrfs.c dgbsv.c - dgbsvx.c dgbtf2.c dgbtrf.c dgbtrs.c dgebak.c dgebal.c dgebd2.c - dgebrd.c dgecon.c dgeequ.c dgees.c dgeesx.c dgeev.c dgeevx.c - dgehd2.c dgehrd.c dgelq2.c dgelqf.c - dgels.c dgelsd.c dgelss.c dgelsy.c dgeql2.c dgeqlf.c - dgeqp3.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c - dgesc2.c dgesdd.c dgesvd.c dgesvdx.c dgesvx.c dgetc2.c - dgetrf2.c dgetri.c - dggbak.c dggbal.c - dgges.c dgges3.c dggesx.c dggev.c dggev3.c dggevx.c - dggglm.c dgghrd.c dgghd3.c dgglse.c dggqrf.c - dggrqf.c dggsvd3.c dggsvp3.c dgtcon.c dgtrfs.c dgtsv.c - dgtsvx.c dgttrf.c dgttrs.c dgtts2.c dhgeqz.c - dhsein.c dhseqr.c 
dlabrd.c dlacon.c dlacn2.c - dlaein.c dlaexc.c dlag2.c dlags2.c dlagtm.c dlagv2.c dlahqr.c - dlahr2.c dlaic1.c dlaln2.c dlals0.c dlalsa.c dlalsd.c - dlangb.c dlange.c dlangt.c dlanhs.c dlansb.c dlansp.c - dlansy.c dlantb.c dlantp.c dlantr.c dlanv2.c - dlapll.c dlapmt.c - dlaqgb.c dlaqge.c dlaqp2.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c - dlaqr0.c dlaqr1.c dlaqr2.c dlaqr3.c dlaqr4.c dlaqr5.c - dlaqtr.c dlar1v.c dlar2v.c iladlr.c iladlc.c - dlarf.c dlarfb.c dlarfb_gett.c dlarfg.c dlarfgp.c dlarft.c dlarfx.c dlarfy.c - dlargv.c dlarrv.c dlartv.c - dlarz.c dlarzb.c dlarzt.c dlasy2.c - dlasyf.c dlasyf_rook.c dlasyf_rk.c dlasyf_aa.c - dlatbs.c dlatdf.c dlatps.c dlatrd.c dlatrs.c dlatrz.c - dopgtr.c dopmtr.c dorg2l.c dorg2r.c - dorgbr.c dorghr.c dorgl2.c dorglq.c dorgql.c dorgqr.c dorgr2.c - dorgrq.c dorgtr.c dorm2l.c dorm2r.c dorm22.c - dormbr.c dormhr.c dorml2.c dormlq.c dormql.c dormqr.c dormr2.c - dormr3.c dormrq.c dormrz.c dormtr.c dpbcon.c dpbequ.c dpbrfs.c - dpbstf.c dpbsv.c dpbsvx.c - dpbtf2.c dpbtrf.c dpbtrs.c dpocon.c dpoequ.c dporfs.c dposv.c - dposvx.c dpotrf2.c dpotri.c dpotrs.c dpstrf.c dpstf2.c - dppcon.c dppequ.c - dpprfs.c dppsv.c dppsvx.c dpptrf.c dpptri.c dpptrs.c dptcon.c - dpteqr.c dptrfs.c dptsv.c dptsvx.c dpttrs.c dptts2.c drscl.c - dsbev.c dsbevd.c dsbevx.c dsbgst.c dsbgv.c dsbgvd.c dsbgvx.c - dsbtrd.c dspcon.c dspev.c dspevd.c dspevx.c dspgst.c - dspgv.c dspgvd.c dspgvx.c dsprfs.c dspsv.c dspsvx.c dsptrd.c - dsptrf.c dsptri.c dsptrs.c dstegr.c dstev.c dstevd.c dstevr.c - dsycon.c dsyev.c dsyevd.c dsyevr.c - dsyevx.c dsygs2.c dsygst.c dsygv.c dsygvd.c dsygvx.c dsyrfs.c - dsysv.c dsysvx.c - dsytd2.c dsytf2.c dsytrd.c dsytrf.c dsytri.c dsytrs.c dsytrs2.c - dsytri2.c dsytri2x.c dsyswapr.c - dsyconv.c dsyconvf.c dsyconvf_rook.c - dsytf2_rook.c dsytrf_rook.c dsytrs_rook.c - dsytri_rook.c dsycon_rook.c dsysv_rook.c - dsytf2_rk.c dsytrf_rk.c dsytrs_3.c - dsytri_3.c dsytri_3x.c dsycon_3.c dsysv_rk.c - dsysv_aa.c dsysv_aa_2stage.c dsytrf_aa.c dsytrf_aa_2stage.c dsytrs_aa.c dsytrs_aa_2stage.c - dtbcon.c - dtbrfs.c dtbtrs.c dtgevc.c dtgex2.c dtgexc.c dtgsen.c - dtgsja.c dtgsna.c dtgsy2.c dtgsyl.c dtpcon.c dtprfs.c dtptri.c - dtptrs.c - dtrcon.c dtrevc.c dtrevc3.c dtrexc.c dtrrfs.c dtrsen.c dtrsna.c dtrsyl.c - dtrtrs.c dtzrzf.c dstemr.c - dsgesv.c dsposv.c dlag2s.c slag2d.c dlat2s.c - dlansf.c dpftrf.c dpftri.c dpftrs.c dsfrk.c dtfsm.c dtftri.c dtfttp.c - dtfttr.c dtpttf.c dtpttr.c dtrttf.c dtrttp.c - dgejsv.c dgesvj.c dgsvj0.c dgsvj1.c - dgeequb.c dsyequb.c dpoequb.c dgbequb.c - dbbcsd.c dlapmr.c dorbdb.c dorbdb1.c dorbdb2.c dorbdb3.c dorbdb4.c - dorbdb5.c dorbdb6.c dorcsd.c dorcsd2by1.c - dgeqrt.c dgeqrt2.c dgeqrt3.c dgemqrt.c - dtpqrt.c dtpqrt2.c dtpmqrt.c dtprfb.c - dgelqt.c dgelqt3.c dgemlqt.c - dgetsls.c dgetsqrhrt.c dgeqr.c dlatsqr.c dlamtsqr.c dgemqr.c - dgelq.c dlaswlq.c dlamswlq.c dgemlq.c - dtplqt.c dtplqt2.c dtpmlqt.c - dsytrd_2stage.c dsytrd_sy2sb.c dsytrd_sb2st.c dsb2st_kernels.c - dsyevd_2stage.c dsyev_2stage.c dsyevx_2stage.c dsyevr_2stage.c - dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c - dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c - dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c - dlarmm.c dlatrs3.c dtrsyl3.c) - -set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c - dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c - dla_syrfsx_extended.c dla_syamv.c dla_syrcond.c dla_syrpvgrw.c - dposvxx.c dporfsx.c dla_porfsx_extended.c dla_porcond.c - dla_porpvgrw.c dgbsvxx.c dgbrfsx.c dla_gbrfsx_extended.c - dla_gbamv.c dla_gbrcond.c 
dla_gbrpvgrw.c dla_lin_berr.c dlarscl2.c - dlascl2.c dla_wwaddw.c) - -set(ZLASRC - zbdsqr.c zgbbrd.c zgbcon.c zgbequ.c zgbrfs.c zgbsv.c zgbsvx.c - zgbtf2.c zgbtrf.c zgbtrs.c zgebak.c zgebal.c zgebd2.c zgebrd.c - zgecon.c zgeequ.c zgees.c zgeesx.c zgeev.c zgeevx.c - zgehd2.c zgehrd.c zgelq2.c zgelqf.c - zgels.c zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c - zgeqr2.c zgeqr2p.c zgeqrf.c zgeqrfp.c zgerfs.c zgerq2.c zgerqf.c - zgesc2.c zgesdd.c zgesvd.c zgesvdx.c zgesvx.c - zgesvj.c zgejsv.c zgsvj0.c zgsvj1.c - zgetc2.c zgetrf2.c - zgetri.c - zggbak.c zggbal.c - zgges.c zgges3.c zggesx.c zggev.c zggev3.c zggevx.c - zggglm.c zgghrd.c zgghd3.c zgglse.c zggqrf.c zggrqf.c - zggsvd3.c zggsvp3.c - zgtcon.c zgtrfs.c zgtsv.c zgtsvx.c zgttrf.c zgttrs.c zgtts2.c zhbev.c - zhbevd.c zhbevx.c zhbgst.c zhbgv.c zhbgvd.c zhbgvx.c zhbtrd.c - zhecon.c zheev.c zheevd.c zheevr.c zheevx.c zhegs2.c zhegst.c - zhegv.c zhegvd.c zhegvx.c zherfs.c zhesv.c zhesvx.c zhetd2.c - zhetf2.c zhetrd.c - zhetrf.c zhetri.c zhetri2.c zhetri2x.c zheswapr.c - zhetrs.c zhetrs2.c - zhetf2_rook.c zhetrf_rook.c zhetri_rook.c - zhetrs_rook.c zhecon_rook.c zhesv_rook.c - zhetf2_rk.c zhetrf_rk.c zhetri_3.c zhetri_3x.c - zhetrs_3.c zhecon_3.c zhesv_rk.c - zhesv_aa.c zhesv_aa_2stage.c zhetrf_aa.c zhetrf_aa_2stage.c zhetrs_aa.c zhetrs_aa_2stage.c - zhgeqz.c zhpcon.c zhpev.c zhpevd.c - zhpevx.c zhpgst.c zhpgv.c zhpgvd.c zhpgvx.c zhprfs.c zhpsv.c - zhpsvx.c - zhptrd.c zhptrf.c zhptri.c zhptrs.c zhsein.c zhseqr.c zlabrd.c - zlacgv.c zlacon.c zlacn2.c zlacp2.c zlacpy.c zlacrm.c zlacrt.c zladiv.c - zlaed0.c zlaed7.c zlaed8.c - zlaein.c zlaesy.c zlaev2.c zlags2.c zlagtm.c - zlahef.c zlahef_rook.c zlahef_rk.c zlahef_aa.c zlahqr.c - zlahr2.c zlaic1.c zlals0.c zlalsa.c zlalsd.c zlangb.c zlange.c - zlangt.c zlanhb.c - zlanhe.c - zlanhp.c zlanhs.c zlanht.c zlansb.c zlansp.c zlansy.c zlantb.c - zlantp.c zlantr.c zlapll.c zlapmt.c zlaqgb.c zlaqge.c - zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqps.c zlaqsb.c - zlaqr0.c zlaqr1.c zlaqr2.c zlaqr3.c zlaqr4.c zlaqr5.c - zlaqsp.c zlaqsy.c zlar1v.c zlar2v.c ilazlr.c ilazlc.c - zlarcm.c zlarf.c zlarfb.c zlarfb_gett.c - zlarfg.c zlarfgp.c zlarft.c - zlarfx.c zlarfy.c zlargv.c zlarnv.c zlarrv.c zlartg.c zlartv.c - zlarz.c zlarzb.c zlarzt.c zlascl.c zlaset.c zlasr.c - zlassq.c zlasyf.c zlasyf_rook.c zlasyf_rk.c zlasyf_aa.c - zlatbs.c zlatdf.c zlatps.c zlatrd.c zlatrs.c zlatrz.c - zpbcon.c zpbequ.c zpbrfs.c zpbstf.c zpbsv.c - zpbsvx.c zpbtf2.c zpbtrf.c zpbtrs.c zpocon.c zpoequ.c zporfs.c - zposv.c zposvx.c zpotrf2.c zpotri.c zpotrs.c zpstrf.c zpstf2.c - zppcon.c zppequ.c zpprfs.c zppsv.c zppsvx.c zpptrf.c zpptri.c zpptrs.c - zptcon.c zpteqr.c zptrfs.c zptsv.c zptsvx.c zpttrf.c zpttrs.c zptts2.c - zrot.c zspcon.c zsprfs.c zspsv.c - zspsvx.c zsptrf.c zsptri.c zsptrs.c zdrscl.c zstedc.c - zstegr.c zstein.c zsteqr.c zsycon.c - zsyrfs.c zsysv.c zsysvx.c zsytf2.c zsytrf.c zsytri.c - zsytri2.c zsytri2x.c zsyswapr.c - zsytrs.c zsytrs2.c - zsyconv.c zsyconvf.c zsyconvf_rook.c - zsytf2_rook.c zsytrf_rook.c zsytrs_rook.c zsytrs_aa.c zsytrs_aa_2stage.c - zsytri_rook.c zsycon_rook.c zsysv_rook.c - zsytf2_rk.c zsytrf_rk.c zsytrf_aa.c zsytrf_aa_2stage.c zsytrs_3.c - zsytri_3.c zsytri_3x.c zsycon_3.c zsysv_rk.c zsysv_aa.c zsysv_aa_2stage.c - ztbcon.c ztbrfs.c ztbtrs.c ztgevc.c ztgex2.c - ztgexc.c ztgsen.c ztgsja.c ztgsna.c ztgsy2.c ztgsyl.c ztpcon.c - ztprfs.c ztptri.c - ztptrs.c ztrcon.c ztrevc.c ztrevc3.c ztrexc.c ztrrfs.c ztrsen.c ztrsna.c - ztrsyl.c ztrtrs.c ztzrzf.c zung2l.c - zung2r.c zungbr.c zunghr.c zungl2.c zunglq.c 
zungql.c zungqr.c zungr2.c - zungrq.c zungtr.c zunm2l.c zunm2r.c zunmbr.c zunmhr.c zunml2.c zunm22.c - zunmlq.c zunmql.c zunmqr.c zunmr2.c zunmr3.c zunmrq.c zunmrz.c - zunmtr.c zupgtr.c - zupmtr.c izmax1.c dzsum1.c zstemr.c - zcgesv.c zcposv.c zlag2c.c clag2z.c zlat2c.c - zhfrk.c ztfttp.c zlanhf.c zpftrf.c zpftri.c zpftrs.c ztfsm.c ztftri.c - ztfttr.c ztpttf.c ztpttr.c ztrttf.c ztrttp.c - zgeequb.c zgbequb.c zsyequb.c zpoequb.c zheequb.c - zbbcsd.c zlapmr.c zunbdb.c zunbdb1.c zunbdb2.c zunbdb3.c zunbdb4.c - zunbdb5.c zunbdb6.c zuncsd.c zuncsd2by1.c - zgeqrt.c zgeqrt2.c zgeqrt3.c zgemqrt.c - ztpqrt.c ztpqrt2.c ztpmqrt.c ztprfb.c - ztplqt.c ztplqt2.c ztpmlqt.c - zgelqt.c zgelqt3.c zgemlqt.c - zgetsls.c zgetsqrhrt.c zgeqr.c zlatsqr.c zlamtsqr.c zgemqr.c - zgelq.c zlaswlq.c zlamswlq.c zgemlq.c - zhetrd_2stage.c zhetrd_he2hb.c zhetrd_hb2st.c zhb2st_kernels.c - zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c - zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c - zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c - zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c) - -set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c - zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c - zla_syrfsx_extended.c zla_syamv.c zla_syrcond_c.c zla_syrcond_x.c - zla_syrpvgrw.c zposvxx.c zporfsx.c zla_porfsx_extended.c - zla_porcond_c.c zla_porcond_x.c zla_porpvgrw.c zgbsvxx.c zgbrfsx.c - zla_gbrfsx_extended.c zla_gbamv.c zla_gbrcond_c.c zla_gbrcond_x.c - zla_gbrpvgrw.c zhesvxx.c zherfsx.c zla_herfsx_extended.c - zla_heamv.c zla_hercond_c.c zla_hercond_x.c zla_herpvgrw.c - zla_lin_berr.c zlarscl2.c zlascl2.c zla_wwaddw.c) - - -if(USE_XBLAS) - set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC}) -endif() - -list(APPEND SLASRC DEPRECATED/sgegs.c DEPRECATED/sgegv.c - DEPRECATED/sgeqpf.c DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c - DEPRECATED/sggsvp.c DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c) -list(APPEND DLASRC DEPRECATED/dgegs.c DEPRECATED/dgegv.c - DEPRECATED/dgeqpf.c DEPRECATED/dgelsx.c DEPRECATED/dggsvd.c - DEPRECATED/dggsvp.c DEPRECATED/dlahrd.c DEPRECATED/dlatzm.c DEPRECATED/dtzrqf.c) -list(APPEND CLASRC DEPRECATED/cgegs.c DEPRECATED/cgegv.c - DEPRECATED/cgeqpf.c DEPRECATED/cgelsx.c DEPRECATED/cggsvd.c - DEPRECATED/cggsvp.c DEPRECATED/clahrd.c DEPRECATED/clatzm.c DEPRECATED/ctzrqf.c) -list(APPEND ZLASRC DEPRECATED/zgegs.c DEPRECATED/zgegv.c - DEPRECATED/zgeqpf.c DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c - DEPRECATED/zggsvp.c DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c) -message(STATUS "Building deprecated routines") - -set(DSLASRC spotrs.c) - -set(ZCLASRC cpotrs.c) - -set(SCATGEN slatm1.c slaran.c slarnd.c) - -set(SMATGEN slatms.c slatme.c slatmr.c slatmt.c - slagge.c slagsy.c slakf2.c slarge.c slaror.c slarot.c slatm2.c - slatm3.c slatm5.c slatm6.c slatm7.c slahilb.c) - -set(CMATGEN clatms.c clatme.c clatmr.c clatmt.c - clagge.c claghe.c clagsy.c clakf2.c clarge.c claror.c clarot.c - clatm1.c clarnd.c clatm2.c clatm3.c clatm5.c clatm6.c clahilb.c slatm7.c) - -set(DZATGEN dlatm1.c dlaran.c dlarnd.c) - -set(DMATGEN dlatms.c dlatme.c dlatmr.c dlatmt.c - dlagge.c dlagsy.c dlakf2.c dlarge.c dlaror.c dlarot.c dlatm2.c - dlatm3.c dlatm5.c dlatm6.c dlatm7.c dlahilb.c) - -set(ZMATGEN zlatms.c zlatme.c zlatmr.c zlatmt.c - zlagge.c zlaghe.c zlagsy.c zlakf2.c zlarge.c zlaror.c zlarot.c - zlatm1.c zlarnd.c zlatm2.c zlatm3.c zlatm5.c zlatm6.c zlahilb.c dlatm7.c) - -if(BUILD_SINGLE) - set(LA_REL_SRC ${SLASRC} ${DSLASRC} ${ALLAUX} 
${SCLAUX}) - set(LA_GEN_SRC ${SMATGEN} ${SCATGEN}) - message(STATUS "Building Single Precision") -endif() -if(BUILD_DOUBLE) - set(LA_REL_SRC ${LA_REL_SRC} ${DLASRC} ${DSLASRC} ${ALLAUX} ${DZLAUX}) - set(LA_GEN_SRC ${LA_GEN_SRC} ${DMATGEN} ${DZATGEN}) - message(STATUS "Building Double Precision") -endif() -if(BUILD_COMPLEX) - set(LA_REL_SRC ${LA_REL_SRC} ${CLASRC} ${ZCLASRC} ${ALLAUX} ${SCLAUX}) - SET(LA_GEN_SRC ${LA_GEN_SRC} ${CMATGEN} ${SCATGEN}) - message(STATUS "Building Single Precision Complex") -endif() -if(BUILD_COMPLEX16) - set(LA_REL_SRC ${LA_REL_SRC} ${ZLASRC} ${ZCLASRC} ${ALLAUX} ${DZLAUX}) - SET(LA_GEN_SRC ${LA_GEN_SRC} ${ZMATGEN} ${DZATGEN}) -# for zlange/zlanhe - if (NOT BUILD_DOUBLE) - set (LA_REL_SRC ${LA_REL_SRC} dcombssq.c) - endif () - message(STATUS "Building Double Precision Complex") -endif() - -endif() - -# add lapack-netlib folder to the sources -set(LA_SOURCES "") -foreach (LA_FILE ${LA_REL_SRC}) - list(APPEND LA_SOURCES "${NETLIB_LAPACK_DIR}/SRC/${LA_FILE}") -endforeach () -foreach (LA_FILE ${LA_GEN_SRC}) - list(APPEND LA_SOURCES "${NETLIB_LAPACK_DIR}/TESTING/MATGEN/${LA_FILE}") -endforeach () - -if (NOT C_LAPACK) - set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}") - if (${F_COMPILER} STREQUAL "GFORTRAN") - set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS} -fno-tree-vectorize") - endif() -else () - set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") -endif () +# Sources for compiling lapack-netlib. Can't use CMakeLists.txt because lapack-netlib already has its own cmake files. +if (NOT C_LAPACK) + message (STATUS "fortran lapack") +set(ALLAUX ilaenv.f ilaenv2stage.f ieeeck.f lsamen.f iparmq.f iparam2stage.F + ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f dlaset.f la_xisnan.F90 + ../INSTALL/ilaver.f xerbla_array.f + ../INSTALL/slamch.f) + +set(SCLAUX + scombssq.f sbdsvdx.f sstevx.f sstein.f + la_constants.f90 + sbdsdc.f + sbdsqr.f sdisna.f slabad.f slacpy.f sladiv.f slae2.f slaebz.f + slaed0.f slaed1.f slaed2.f slaed3.f slaed4.f slaed5.f slaed6.f + slaed7.f slaed8.f slaed9.f slaeda.f slaev2.f slagtf.f + slagts.f slamrg.f slanst.f + slapy2.f slapy3.f slarnv.f + slarra.f slarrb.f slarrc.f slarrd.f slarre.f slarrf.f slarrj.f + slarrk.f slarrr.f slaneg.f + slartg.f90 slaruv.f slas2.f slascl.f + slasd0.f slasd1.f slasd2.f slasd3.f slasd4.f slasd5.f slasd6.f + slasd7.f slasd8.f slasda.f slasdq.f slasdt.f + slaset.f slasq1.f slasq2.f slasq3.f slasq4.f slasq5.f slasq6.f + slasr.f slasrt.f slassq.f90 slasv2.f spttrf.f sstebz.f sstedc.f + ssteqr.f ssterf.f slaisnan.f sisnan.f + slartgp.f slartgs.f ../INSTALL/sroundup_lwork.f + ../INSTALL/second_${TIMER}.f) + +set(DZLAUX + la_constants.f90 + dbdsdc.f + dbdsvdx.f dstevx.f dstein.f + dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f + dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f + dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f + dlagts.f dlamrg.f dlanst.f + dlapy2.f dlapy3.f dlarnv.f + dlarra.f dlarrb.f dlarrc.f dlarrd.f dlarre.f dlarrf.f dlarrj.f + dlarrk.f dlarrr.f dlaneg.f + dlartg.f90 dlaruv.f dlas2.f dlascl.f + dlasd0.f dlasd1.f dlasd2.f dlasd3.f dlasd4.f dlasd5.f dlasd6.f + dlasd7.f dlasd8.f dlasda.f dlasdq.f dlasdt.f + dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f + dlasr.f dlasrt.f dlassq.f90 dlasv2.f dpttrf.f dstebz.f dstedc.f + dsteqr.f dsterf.f dlaisnan.f disnan.f + dlartgp.f dlartgs.f ../INSTALL/droundup_lwork.f + ../INSTALL/dlamch.f 
../INSTALL/dsecnd_${TIMER}.f) + +set(SLASRC + sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f + sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f + sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f + sgehd2.f sgehrd.f sgelq2.f sgelqf.f + sgels.f sgelsd.f sgelss.f sgelsy.f sgeql2.f sgeqlf.f + sgeqp3.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f + sgesc2.f sgesdd.f sgesvd.f sgesvdx.f sgesvx.f sgetc2.f + sgetrf2.f sgetri.f + sggbak.f sggbal.f + sgges.f sgges3.f sggesx.f sggev.f sggev3.f sggevx.f + sggglm.f sgghrd.f sgghd3.f sgglse.f sggqrf.f + sggrqf.f sggsvd3.f sggsvp3.f sgtcon.f sgtrfs.f sgtsv.f + sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f + shsein.f shseqr.f slabrd.f slacon.f slacn2.f + slaqz0.f slaqz1.f slaqz2.f slaqz3.f slaqz4.f + slaein.f slaexc.f slag2.f slags2.f slagtm.f slagv2.f slahqr.f + slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f + slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f + slansy.f slantb.f slantp.f slantr.f slanv2.f + slapll.f slapmt.f + slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f + slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f + slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f + slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f + slarrv.f slartv.f + slarz.f slarzb.f slarzt.f slasy2.f + slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f + slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f + sopgtr.f sopmtr.f sorg2l.f sorg2r.f + sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f + sorgrq.f sorgtr.f sorm2l.f sorm2r.f sorm22.f + sormbr.f sormhr.f sorml2.f sormlq.f sormql.f sormqr.f sormr2.f + sormr3.f sormrq.f sormrz.f sormtr.f spbcon.f spbequ.f spbrfs.f + spbstf.f spbsv.f spbsvx.f + spbtf2.f spbtrf.f spbtrs.f spocon.f spoequ.f sporfs.f sposv.f + sposvx.f spotrf2.f spotri.f spstrf.f spstf2.f + sppcon.f sppequ.f + spprfs.f sppsv.f sppsvx.f spptrf.f spptri.f spptrs.f sptcon.f + spteqr.f sptrfs.f sptsv.f sptsvx.f spttrs.f sptts2.f srscl.f + ssbev.f ssbevd.f ssbevx.f ssbgst.f ssbgv.f ssbgvd.f ssbgvx.f + ssbtrd.f sspcon.f sspev.f sspevd.f sspevx.f sspgst.f + sspgv.f sspgvd.f sspgvx.f ssprfs.f sspsv.f sspsvx.f ssptrd.f + ssptrf.f ssptri.f ssptrs.f sstegr.f sstev.f sstevd.f sstevr.f + ssycon.f ssyev.f ssyevd.f ssyevr.f ssyevx.f ssygs2.f + ssygst.f ssygv.f ssygvd.f ssygvx.f ssyrfs.f ssysv.f ssysvx.f + ssytd2.f ssytf2.f ssytrd.f ssytrf.f ssytri.f ssytri2.f ssytri2x.f + ssyswapr.f ssytrs.f ssytrs2.f + ssyconv.f ssyconvf.f ssyconvf_rook.f + ssysv_aa.f ssysv_aa_2stage.f ssytrf_aa.f ssytrf_aa_2stage.f ssytrs_aa.f ssytrs_aa_2stage.f + ssytf2_rook.f ssytrf_rook.f ssytrs_rook.f + ssytri_rook.f ssycon_rook.f ssysv_rook.f + ssytf2_rk.f ssytrf_rk.f ssytrs_3.f + ssytri_3.f ssytri_3x.f ssycon_3.f ssysv_rk.f + ssysv_aa.f ssytrf_aa.f ssytrs_aa.f + stbcon.f + stbrfs.f stbtrs.f stgevc.f stgex2.f stgexc.f stgsen.f + stgsja.f stgsna.f stgsy2.f stgsyl.f stpcon.f stprfs.f stptri.f + stptrs.f + strcon.f strevc.f strevc3.f strexc.f strrfs.f strsen.f strsna.f strsyl.f + strtrs.f stzrzf.f sstemr.f + slansf.f spftrf.f spftri.f spftrs.f ssfrk.f stfsm.f stftri.f stfttp.f + stfttr.f stpttf.f stpttr.f strttf.f strttp.f + sgejsv.f sgesvj.f sgsvj0.f sgsvj1.f + sgeequb.f ssyequb.f spoequb.f sgbequb.f + sbbcsd.f slapmr.f sorbdb.f sorbdb1.f sorbdb2.f sorbdb3.f sorbdb4.f + sorbdb5.f sorbdb6.f sorcsd.f sorcsd2by1.f + sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f + stpqrt.f stpqrt2.f stpmqrt.f stprfb.f + sgelqt.f sgelqt3.f sgemlqt.f + sgetsls.f sgetsqrhrt.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f + 
sgelq.f slaswlq.f slamswlq.f sgemlq.f + stplqt.f stplqt2.f stpmlqt.f + ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f + ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f + ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f + sgesvdq.f slaorhr_col_getrfnp.f + slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f + slarmm.f slatrs3.f strsyl3.f) + +set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f + sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f + sla_syrfsx_extended.f sla_syamv.f sla_syrcond.f sla_syrpvgrw.f + sposvxx.f sporfsx.f sla_porfsx_extended.f sla_porcond.f + sla_porpvgrw.f sgbsvxx.f sgbrfsx.f sla_gbrfsx_extended.f + sla_gbamv.f sla_gbrcond.f sla_gbrpvgrw.f sla_lin_berr.f slarscl2.f + slascl2.f sla_wwaddw.f) + +set(CLASRC + cbdsqr.f cgbbrd.f cgbcon.f cgbequ.f cgbrfs.f cgbsv.f cgbsvx.f + cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f + cgecon.f cgeequ.f cgees.f cgeesx.f cgeev.f cgeevx.f + cgehd2.f cgehrd.f cgelq2.f cgelqf.f + cgels.f cgelsd.f cgelss.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f + cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f cgerq2.f cgerqf.f + cgesc2.f cgesdd.f cgesvd.f cgesvdx.f + cgesvj.f cgejsv.f cgsvj0.f cgsvj1.f + cgesvx.f cgetc2.f cgetrf2.f + cgetri.f + cggbak.f cggbal.f + cgges.f cgges3.f cggesx.f cggev.f cggev3.f cggevx.f + cggglm.f cgghrd.f cgghd3.f cgglse.f cggqrf.f cggrqf.f + cggsvd3.f cggsvp3.f + cgtcon.f cgtrfs.f cgtsv.f cgtsvx.f cgttrf.f cgttrs.f cgtts2.f chbev.f + chbevd.f chbevx.f chbgst.f chbgv.f chbgvd.f chbgvx.f chbtrd.f + checon.f cheev.f cheevd.f cheevr.f cheevx.f chegs2.f chegst.f + chegv.f chegvd.f chegvx.f cherfs.f chesv.f chesvx.f chetd2.f + chetf2.f chetrd.f + chetrf.f chetri.f chetri2.f chetri2x.f cheswapr.f + chetrs.f chetrs2.f + chetf2_rook.f chetrf_rook.f chetri_rook.f + chetrs_rook.f checon_rook.f chesv_rook.f + chetf2_rk.f chetrf_rk.f chetri_3.f chetri_3x.f + chetrs_3.f checon_3.f chesv_rk.f + chesv_aa.f chesv_aa_2stage.f chetrf_aa.f chetrf_aa_2stage.f chetrs_aa.f chetrs_aa_2stage.f + chgeqz.f chpcon.f chpev.f chpevd.f + chpevx.f chpgst.f chpgv.f chpgvd.f chpgvx.f chprfs.f chpsv.f + chpsvx.f + chptrd.f chptrf.f chptri.f chptrs.f chsein.f chseqr.f clabrd.f + clacgv.f clacon.f clacn2.f clacp2.f clacpy.f clacrm.f clacrt.f cladiv.f + claed0.f claed7.f claed8.f + claein.f claesy.f claev2.f clags2.f clagtm.f + clahef.f clahef_rook.f clahef_rk.f clahef_aa.f clahqr.f + clahr2.f claic1.f clals0.f clalsa.f clalsd.f clangb.f clange.f clangt.f + clanhb.f clanhe.f + clanhp.f clanhs.f clanht.f clansb.f clansp.f clansy.f clantb.f + clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f + claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f + claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f + claqz0.f claqz1.f claqz2.f claqz3.f + claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f + clarf.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f + clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f90 clartv.f + clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f90 + clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f + clatbs.f clatdf.f clatps.f clatrd.f clatrs.f clatrz.f + cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f + cpbsvx.f cpbtf2.f cpbtrf.f cpbtrs.f cpocon.f cpoequ.f cporfs.f + cposv.f cposvx.f cpotrf2.f cpotri.f cpstrf.f cpstf2.f + cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f + cptcon.f cpteqr.f cptrfs.f cptsv.f cptsvx.f cpttrf.f cpttrs.f cptts2.f + crot.f cspcon.f csprfs.f cspsv.f + cspsvx.f csptrf.f csptri.f csptrs.f csrscl.f 
cstedc.f + cstegr.f cstein.f csteqr.f csycon.f + csyrfs.f csysv.f csysvx.f csytf2.f csytrf.f csytri.f + csytri2.f csytri2x.f csyswapr.f + csytrs.f csytrs2.f + csyconv.f csyconvf.f csyconvf_rook.f + csytf2_rook.f csytrf_rook.f csytrs_rook.f + csytri_rook.f csycon_rook.f csysv_rook.f + csytf2_rk.f csytrf_rk.f csytrf_aa.f csytrf_aa_2stage.f csytrs_3.f csytrs_aa.f csytrs_aa_2stage.f + csytri_3.f csytri_3x.f csycon_3.f csysv_rk.f csysv_aa.f csysv_aa_2stage.f + ctbcon.f ctbrfs.f ctbtrs.f ctgevc.f ctgex2.f + ctgexc.f ctgsen.f ctgsja.f ctgsna.f ctgsy2.f ctgsyl.f ctpcon.f + ctprfs.f ctptri.f + ctptrs.f ctrcon.f ctrevc.f ctrevc3.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f + ctrsyl.f ctrtrs.f ctzrzf.f cung2l.f cung2r.f + cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f + cungrq.f cungtr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f cunm22.f + cunmlq.f cunmql.f cunmqr.f cunmr2.f cunmr3.f cunmrq.f cunmrz.f + cunmtr.f cupgtr.f cupmtr.f icmax1.f scsum1.f cstemr.f + chfrk.f ctfttp.f clanhf.f cpftrf.f cpftri.f cpftrs.f ctfsm.f ctftri.f + ctfttr.f ctpttf.f ctpttr.f ctrttf.f ctrttp.f + cgeequb.f cgbequb.f csyequb.f cpoequb.f cheequb.f + cbbcsd.f clapmr.f cunbdb.f cunbdb1.f cunbdb2.f cunbdb3.f cunbdb4.f + cunbdb5.f cunbdb6.f cuncsd.f cuncsd2by1.f + cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f + ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f + cgelqt.f cgelqt3.f cgemlqt.f + cgetsls.f cgetsqrhrt.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f + cgelq.f claswlq.f clamswlq.f cgemlq.f + ctplqt.f ctplqt2.f ctpmlqt.f + chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f + cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f + chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f + cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f + cungtsqr.f cungtsqr_row.f cunhr_col.f + clatrs3.f ctrsyl3.f ) + +set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f + cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f + csysvxx.f csyrfsx.f cla_syrfsx_extended.f cla_syamv.f + cla_syrcond_c.f cla_syrcond_x.f cla_syrpvgrw.f + cposvxx.f cporfsx.f cla_porfsx_extended.f + cla_porcond_c.f cla_porcond_x.f cla_porpvgrw.f + cgbsvxx.f cgbrfsx.f cla_gbrfsx_extended.f cla_gbamv.f + cla_gbrcond_c.f cla_gbrcond_x.f cla_gbrpvgrw.f + chesvxx.f cherfsx.f cla_herfsx_extended.f cla_heamv.f + cla_hercond_c.f cla_hercond_x.f cla_herpvgrw.f + cla_lin_berr.f clarscl2.f clascl2.f cla_wwaddw.f) + +set(DLASRC + dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f + dgbsvx.f dgbtf2.f dgbtrf.f dgbtrs.f dgebak.f dgebal.f dgebd2.f + dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f + dgehd2.f dgehrd.f dgelq2.f dgelqf.f + dgels.f dgelsd.f dgelss.f dgelsy.f dgeql2.f dgeqlf.f + dgeqp3.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f + dgesc2.f dgesdd.f dgesvd.f dgesvdx.f dgesvx.f dgetc2.f + dgetrf2.f dgetri.f + dggbak.f dggbal.f + dgges.f dgges3.f dggesx.f dggev.f dggev3.f dggevx.f + dggglm.f dgghrd.f dgghd3.f dgglse.f dggqrf.f + dggrqf.f dggsvd3.f dggsvp3.f dgtcon.f dgtrfs.f dgtsv.f + dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f + dlaqz0.f dlaqz1.f dlaqz2.f dlaqz3.f dlaqz4.f + dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f + dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f + dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f + dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f + dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f + dlapll.f dlapmt.f + dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f + dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f + dlaqtr.f dlar1v.f dlar2v.f iladlr.f 
iladlc.f + dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f + dlargv.f dlarrv.f dlartv.f + dlarz.f dlarzb.f dlarzt.f dlasy2.f + dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f + dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f + dopgtr.f dopmtr.f dorg2l.f dorg2r.f + dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f + dorgrq.f dorgtr.f dorm2l.f dorm2r.f dorm22.f + dormbr.f dormhr.f dorml2.f dormlq.f dormql.f dormqr.f dormr2.f + dormr3.f dormrq.f dormrz.f dormtr.f dpbcon.f dpbequ.f dpbrfs.f + dpbstf.f dpbsv.f dpbsvx.f + dpbtf2.f dpbtrf.f dpbtrs.f dpocon.f dpoequ.f dporfs.f dposv.f + dposvx.f dpotrf2.f dpotri.f dpotrs.f dpstrf.f dpstf2.f + dppcon.f dppequ.f + dpprfs.f dppsv.f dppsvx.f dpptrf.f dpptri.f dpptrs.f dptcon.f + dpteqr.f dptrfs.f dptsv.f dptsvx.f dpttrs.f dptts2.f drscl.f + dsbev.f dsbevd.f dsbevx.f dsbgst.f dsbgv.f dsbgvd.f dsbgvx.f + dsbtrd.f dspcon.f dspev.f dspevd.f dspevx.f dspgst.f + dspgv.f dspgvd.f dspgvx.f dsprfs.f dspsv.f dspsvx.f dsptrd.f + dsptrf.f dsptri.f dsptrs.f dstegr.f dstev.f dstevd.f dstevr.f + dsycon.f dsyev.f dsyevd.f dsyevr.f + dsyevx.f dsygs2.f dsygst.f dsygv.f dsygvd.f dsygvx.f dsyrfs.f + dsysv.f dsysvx.f + dsytd2.f dsytf2.f dsytrd.f dsytrf.f dsytri.f dsytrs.f dsytrs2.f + dsytri2.f dsytri2x.f dsyswapr.f + dsyconv.f dsyconvf.f dsyconvf_rook.f + dsytf2_rook.f dsytrf_rook.f dsytrs_rook.f + dsytri_rook.f dsycon_rook.f dsysv_rook.f + dsytf2_rk.f dsytrf_rk.f dsytrs_3.f + dsytri_3.f dsytri_3x.f dsycon_3.f dsysv_rk.f + dsysv_aa.f dsysv_aa_2stage.f dsytrf_aa.f dsytrf_aa_2stage.f dsytrs_aa.f dsytrs_aa_2stage.f + dtbcon.f + dtbrfs.f dtbtrs.f dtgevc.f dtgex2.f dtgexc.f dtgsen.f + dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f + dtptrs.f + dtrcon.f dtrevc.f dtrevc3.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f + dtrtrs.f dtzrzf.f dstemr.f + dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f + dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f dtfttp.f + dtfttr.f dtpttf.f dtpttr.f dtrttf.f dtrttp.f + dgejsv.f dgesvj.f dgsvj0.f dgsvj1.f + dgeequb.f dsyequb.f dpoequb.f dgbequb.f + dbbcsd.f dlapmr.f dorbdb.f dorbdb1.f dorbdb2.f dorbdb3.f dorbdb4.f + dorbdb5.f dorbdb6.f dorcsd.f dorcsd2by1.f + dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f + dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f + dgelqt.f dgelqt3.f dgemlqt.f + dgetsls.f dgetsqrhrt.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f + dgelq.f dlaswlq.f dlamswlq.f dgemlq.f + dtplqt.f dtplqt2.f dtpmlqt.f + dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f + dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f + dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f + dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f + dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f + dlarmm.f dlatrs3.f dtrsyl3.f) + +set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f + dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f + dla_syrfsx_extended.f dla_syamv.f dla_syrcond.f dla_syrpvgrw.f + dposvxx.f dporfsx.f dla_porfsx_extended.f dla_porcond.f + dla_porpvgrw.f dgbsvxx.f dgbrfsx.f dla_gbrfsx_extended.f + dla_gbamv.f dla_gbrcond.f dla_gbrpvgrw.f dla_lin_berr.f dlarscl2.f + dlascl2.f dla_wwaddw.f) + +set(ZLASRC + zbdsqr.f zgbbrd.f zgbcon.f zgbequ.f zgbrfs.f zgbsv.f zgbsvx.f + zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f + zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f + zgehd2.f zgehrd.f zgelq2.f zgelqf.f + zgels.f zgelsd.f zgelss.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f + zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f + 
zgesc2.f zgesdd.f zgesvd.f zgesvdx.f zgesvx.f + zgesvj.f zgejsv.f zgsvj0.f zgsvj1.f + zgetc2.f zgetrf2.f + zgetri.f + zggbak.f zggbal.f + zgges.f zgges3.f zggesx.f zggev.f zggev3.f zggevx.f + zggglm.f zgghrd.f zgghd3.f zgglse.f zggqrf.f zggrqf.f + zggsvd3.f zggsvp3.f + zgtcon.f zgtrfs.f zgtsv.f zgtsvx.f zgttrf.f zgttrs.f zgtts2.f zhbev.f + zhbevd.f zhbevx.f zhbgst.f zhbgv.f zhbgvd.f zhbgvx.f zhbtrd.f + zhecon.f zheev.f zheevd.f zheevr.f zheevx.f zhegs2.f zhegst.f + zhegv.f zhegvd.f zhegvx.f zherfs.f zhesv.f zhesvx.f zhetd2.f + zhetf2.f zhetrd.f + zhetrf.f zhetri.f zhetri2.f zhetri2x.f zheswapr.f + zhetrs.f zhetrs2.f + zhetf2_rook.f zhetrf_rook.f zhetri_rook.f + zhetrs_rook.f zhecon_rook.f zhesv_rook.f + zhetf2_rk.f zhetrf_rk.f zhetri_3.f zhetri_3x.f + zhetrs_3.f zhecon_3.f zhesv_rk.f + zhesv_aa.f zhesv_aa_2stage.f zhetrf_aa.f zhetrf_aa_2stage.f zhetrs_aa.f zhetrs_aa_2stage.f + zhgeqz.f zhpcon.f zhpev.f zhpevd.f + zlaqz0.f zlaqz1.f zlaqz2.f zlaqz3.f + zhpevx.f zhpgst.f zhpgv.f zhpgvd.f zhpgvx.f zhprfs.f zhpsv.f + zhpsvx.f + zhptrd.f zhptrf.f zhptri.f zhptrs.f zhsein.f zhseqr.f zlabrd.f + zlacgv.f zlacon.f zlacn2.f zlacp2.f zlacpy.f zlacrm.f zlacrt.f zladiv.f + zlaed0.f zlaed7.f zlaed8.f + zlaein.f zlaesy.f zlaev2.f zlags2.f zlagtm.f + zlahef.f zlahef_rook.f zlahef_rk.f zlahef_aa.f zlahqr.f + zlahr2.f zlaic1.f zlals0.f zlalsa.f zlalsd.f zlangb.f zlange.f + zlangt.f zlanhb.f + zlanhe.f + zlanhp.f zlanhs.f zlanht.f zlansb.f zlansp.f zlansy.f zlantb.f + zlantp.f zlantr.f zlapll.f zlapmt.f zlaqgb.f zlaqge.f + zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f + zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f + zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f + zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f + zlarfg.f zlarfgp.f zlarft.f + zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f90 zlartv.f + zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f + zlassq.f90 zlasyf.f zlasyf_rook.f zlasyf_rk.f zlasyf_aa.f + zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f + zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f + zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f + zposv.f zposvx.f zpotrf2.f zpotri.f zpotrs.f zpstrf.f zpstf2.f + zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f + zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f + zrot.f zspcon.f zsprfs.f zspsv.f + zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f + zstegr.f zstein.f zsteqr.f zsycon.f + zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f + zsytri2.f zsytri2x.f zsyswapr.f + zsytrs.f zsytrs2.f + zsyconv.f zsyconvf.f zsyconvf_rook.f + zsytf2_rook.f zsytrf_rook.f zsytrs_rook.f zsytrs_aa.f zsytrs_aa_2stage.f + zsytri_rook.f zsycon_rook.f zsysv_rook.f + zsytf2_rk.f zsytrf_rk.f zsytrf_aa.f zsytrf_aa_2stage.f zsytrs_3.f + zsytri_3.f zsytri_3x.f zsycon_3.f zsysv_rk.f zsysv_aa.f zsysv_aa_2stage.f + ztbcon.f ztbrfs.f ztbtrs.f ztgevc.f ztgex2.f + ztgexc.f ztgsen.f ztgsja.f ztgsna.f ztgsy2.f ztgsyl.f ztpcon.f + ztprfs.f ztptri.f + ztptrs.f ztrcon.f ztrevc.f ztrevc3.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f + ztrsyl.f ztrtrs.f ztzrzf.f zung2l.f + zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f + zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f zunm22.f + zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f + zunmtr.f zupgtr.f + zupmtr.f izmax1.f dzsum1.f zstemr.f + zcgesv.f zcposv.f zlag2c.f clag2z.f zlat2c.f + zhfrk.f ztfttp.f zlanhf.f zpftrf.f zpftri.f zpftrs.f ztfsm.f ztftri.f + ztfttr.f ztpttf.f ztpttr.f ztrttf.f ztrttp.f + 
zgeequb.f zgbequb.f zsyequb.f zpoequb.f zheequb.f + zbbcsd.f zlapmr.f zunbdb.f zunbdb1.f zunbdb2.f zunbdb3.f zunbdb4.f + zunbdb5.f zunbdb6.f zuncsd.f zuncsd2by1.f + zgeqrt.f zgeqrt2.f zgeqrt3.f zgemqrt.f + ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f + ztplqt.f ztplqt2.f ztpmlqt.f + zgelqt.f zgelqt3.f zgemlqt.f + zgetsls.f zgetsqrhrt.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f + zgelq.f zlaswlq.f zlamswlq.f zgemlq.f + zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f + zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f + zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f + zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f + zungtsqr.f zungtsqr_row.f zunhr_col.f + zlatrs3.f ztrsyl3.f) + +set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f + zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f + zla_syrfsx_extended.f zla_syamv.f zla_syrcond_c.f zla_syrcond_x.f + zla_syrpvgrw.f zposvxx.f zporfsx.f zla_porfsx_extended.f + zla_porcond_c.f zla_porcond_x.f zla_porpvgrw.f zgbsvxx.f zgbrfsx.f + zla_gbrfsx_extended.f zla_gbamv.f zla_gbrcond_c.f zla_gbrcond_x.f + zla_gbrpvgrw.f zhesvxx.f zherfsx.f zla_herfsx_extended.f + zla_heamv.f zla_hercond_c.f zla_hercond_x.f zla_herpvgrw.f + zla_lin_berr.f zlarscl2.f zlascl2.f zla_wwaddw.f) + + +if(USE_XBLAS) + set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC}) +endif() + +list(APPEND SLASRC DEPRECATED/sgegs.f DEPRECATED/sgegv.f + DEPRECATED/sgeqpf.f DEPRECATED/sgelsx.f DEPRECATED/sggsvd.f + DEPRECATED/sggsvp.f DEPRECATED/slahrd.f DEPRECATED/slatzm.f DEPRECATED/stzrqf.f) +list(APPEND DLASRC DEPRECATED/dgegs.f DEPRECATED/dgegv.f + DEPRECATED/dgeqpf.f DEPRECATED/dgelsx.f DEPRECATED/dggsvd.f + DEPRECATED/dggsvp.f DEPRECATED/dlahrd.f DEPRECATED/dlatzm.f DEPRECATED/dtzrqf.f) +list(APPEND CLASRC DEPRECATED/cgegs.f DEPRECATED/cgegv.f + DEPRECATED/cgeqpf.f DEPRECATED/cgelsx.f DEPRECATED/cggsvd.f + DEPRECATED/cggsvp.f DEPRECATED/clahrd.f DEPRECATED/clatzm.f DEPRECATED/ctzrqf.f) +list(APPEND ZLASRC DEPRECATED/zgegs.f DEPRECATED/zgegv.f + DEPRECATED/zgeqpf.f DEPRECATED/zgelsx.f DEPRECATED/zggsvd.f + DEPRECATED/zggsvp.f DEPRECATED/zlahrd.f DEPRECATED/zlatzm.f DEPRECATED/ztzrqf.f) +message(STATUS "Building deprecated routines") + +set(DSLASRC spotrs.f) + +set(ZCLASRC cpotrs.f) + +set(SCATGEN slatm1.f slaran.f slarnd.f) + +set(SMATGEN slatms.f slatme.f slatmr.f slatmt.f + slagge.f slagsy.f slakf2.f slarge.f slaror.f slarot.f slatm2.f + slatm3.f slatm5.f slatm6.f slatm7.f slahilb.f) + +set(CMATGEN clatms.f clatme.f clatmr.f clatmt.f + clagge.f claghe.f clagsy.f clakf2.f clarge.f claror.f clarot.f + clatm1.f clarnd.f clatm2.f clatm3.f clatm5.f clatm6.f clahilb.f slatm7.f) + +set(DZATGEN dlatm1.f dlaran.f dlarnd.f) + +set(DMATGEN dlatms.f dlatme.f dlatmr.f dlatmt.f + dlagge.f dlagsy.f dlakf2.f dlarge.f dlaror.f dlarot.f dlatm2.f + dlatm3.f dlatm5.f dlatm6.f dlatm7.f dlahilb.f) + +set(ZMATGEN zlatms.f zlatme.f zlatmr.f zlatmt.f + zlagge.f zlaghe.f zlagsy.f zlakf2.f zlarge.f zlaror.f zlarot.f + zlatm1.f zlarnd.f zlatm2.f zlatm3.f zlatm5.f zlatm6.f zlahilb.f dlatm7.f) + +if(BUILD_SINGLE) + set(LA_REL_SRC ${SLASRC} ${DSLASRC} ${ALLAUX} ${SCLAUX}) + set(LA_GEN_SRC ${SMATGEN} ${SCATGEN}) + message(STATUS "Building Single Precision") +endif() +if(BUILD_DOUBLE) + set(LA_REL_SRC ${LA_REL_SRC} ${DLASRC} ${DSLASRC} ${ALLAUX} ${DZLAUX}) + set(LA_GEN_SRC ${LA_GEN_SRC} ${DMATGEN} ${DZATGEN}) + message(STATUS "Building Double Precision") +endif() +if(BUILD_COMPLEX) + set(LA_REL_SRC ${LA_REL_SRC} ${CLASRC} ${ZCLASRC} ${ALLAUX} 
${SCLAUX}) + SET(LA_GEN_SRC ${LA_GEN_SRC} ${CMATGEN} ${SCATGEN}) + message(STATUS "Building Single Precision Complex") +endif() +if(BUILD_COMPLEX16) + set(LA_REL_SRC ${LA_REL_SRC} ${ZLASRC} ${ZCLASRC} ${ALLAUX} ${DZLAUX}) + SET(LA_GEN_SRC ${LA_GEN_SRC} ${ZMATGEN} ${DZATGEN}) +# for zlange/zlanhe + if (NOT BUILD_DOUBLE) + set (LA_REL_SRC ${LA_REL_SRC} dcombssq.f) + endif () + message(STATUS "Building Double Precision Complex") +endif() + +else () + + message (STATUS "c lapack") +set(ALLAUX ilaenv.c ilaenv2stage.c ieeeck.c lsamen.c iparmq.c iparam2stage.c + ilaprec.c ilatrans.c ilauplo.c iladiag.c chla_transtype.c dlaset.c + ../INSTALL/ilaver.c xerbla_array.c + ../INSTALL/slamch.c) + +set(SCLAUX + scombssq.c sbdsvdx.c sstevx.c sstein.c + sbdsdc.c + sbdsqr.c sdisna.c slabad.c slacpy.c sladiv.c slae2.c slaebz.c + slaed0.c slaed1.c slaed2.c slaed3.c slaed4.c slaed5.c slaed6.c + slaed7.c slaed8.c slaed9.c slaeda.c slaev2.c slagtf.c + slagts.c slamrg.c slanst.c + slapy2.c slapy3.c slarnv.c + slarra.c slarrb.c slarrc.c slarrd.c slarre.c slarrf.c slarrj.c + slarrk.c slarrr.c slaneg.c + slartg.c slaruv.c slas2.c slascl.c + slasd0.c slasd1.c slasd2.c slasd3.c slasd4.c slasd5.c slasd6.c + slasd7.c slasd8.c slasda.c slasdq.c slasdt.c + slaset.c slasq1.c slasq2.c slasq3.c slasq4.c slasq5.c slasq6.c + slasr.c slasrt.c slassq.c slasv2.c spttrf.c sstebz.c sstedc.c + ssteqr.c ssterf.c slaisnan.c sisnan.c + slartgp.c slartgs.c + ../INSTALL/second_${TIMER}.c) + +set(DZLAUX + dbdsdc.c + dbdsvdx.c dstevx.c dstein.c + dbdsqr.c ddisna.c dlabad.c dlacpy.c dladiv.c dlae2.c dlaebz.c + dlaed0.c dlaed1.c dlaed2.c dlaed3.c dlaed4.c dlaed5.c dlaed6.c + dlaed7.c dlaed8.c dlaed9.c dlaeda.c dlaev2.c dlagtf.c + dlagts.c dlamrg.c dlanst.c + dlapy2.c dlapy3.c dlarnv.c + dlarra.c dlarrb.c dlarrc.c dlarrd.c dlarre.c dlarrf.c dlarrj.c + dlarrk.c dlarrr.c dlaneg.c + dlartg.c dlaruv.c dlas2.c dlascl.c + dlasd0.c dlasd1.c dlasd2.c dlasd3.c dlasd4.c dlasd5.c dlasd6.c + dlasd7.c dlasd8.c dlasda.c dlasdq.c dlasdt.c + dlasq1.c dlasq2.c dlasq3.c dlasq4.c dlasq5.c dlasq6.c + dlasr.c dlasrt.c dlassq.c dlasv2.c dpttrf.c dstebz.c dstedc.c + dsteqr.c dsterf.c dlaisnan.c disnan.c + dlartgp.c dlartgs.c + ../INSTALL/dlamch.c ../INSTALL/dsecnd_${TIMER}.c) + +set(SLASRC + sgbbrd.c sgbcon.c sgbequ.c sgbrfs.c sgbsv.c + sgbsvx.c sgbtf2.c sgbtrf.c sgbtrs.c sgebak.c sgebal.c sgebd2.c + sgebrd.c sgecon.c sgeequ.c sgees.c sgeesx.c sgeev.c sgeevx.c + sgehd2.c sgehrd.c sgelq2.c sgelqf.c + sgels.c sgelsd.c sgelss.c sgelsy.c sgeql2.c sgeqlf.c + sgeqp3.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c + sgesc2.c sgesdd.c sgesvd.c sgesvdx.c sgesvx.c sgetc2.c + sgetrf2.c sgetri.c + sggbak.c sggbal.c + sgges.c sgges3.c sggesx.c sggev.c sggev3.c sggevx.c + sggglm.c sgghrd.c sgghd3.c sgglse.c sggqrf.c + sggrqf.c sggsvd3.c sggsvp3.c sgtcon.c sgtrfs.c sgtsv.c + sgtsvx.c sgttrf.c sgttrs.c sgtts2.c shgeqz.c + shsein.c shseqr.c slabrd.c slacon.c slacn2.c + slaein.c slaexc.c slag2.c slags2.c slagtm.c slagv2.c slahqr.c + slahr2.c slaic1.c slaln2.c slals0.c slalsa.c slalsd.c + slangb.c slange.c slangt.c slanhs.c slansb.c slansp.c + slansy.c slantb.c slantp.c slantr.c slanv2.c + slapll.c slapmt.c + slaqgb.c slaqge.c slaqp2.c slaqps.c slaqsb.c slaqsp.c slaqsy.c + slaqr0.c slaqr1.c slaqr2.c slaqr3.c slaqr4.c slaqr5.c + slaqtr.c slar1v.c slar2v.c ilaslr.c ilaslc.c + slarf.c slarfb.c slarfb_gett.c slarfg.c slarfgp.c slarft.c slarfx.c slarfy.c slargv.c + slarrv.c slartv.c + slarz.c slarzb.c slarzt.c slasy2.c + slasyf.c slasyf_rook.c slasyf_rk.c slasyf_aa.c + 
slatbs.c slatdf.c slatps.c slatrd.c slatrs.c slatrz.c + sopgtr.c sopmtr.c sorg2l.c sorg2r.c + sorgbr.c sorghr.c sorgl2.c sorglq.c sorgql.c sorgqr.c sorgr2.c + sorgrq.c sorgtr.c sorm2l.c sorm2r.c sorm22.c + sormbr.c sormhr.c sorml2.c sormlq.c sormql.c sormqr.c sormr2.c + sormr3.c sormrq.c sormrz.c sormtr.c spbcon.c spbequ.c spbrfs.c + spbstf.c spbsv.c spbsvx.c + spbtf2.c spbtrf.c spbtrs.c spocon.c spoequ.c sporfs.c sposv.c + sposvx.c spotrf2.c spotri.c spstrf.c spstf2.c + sppcon.c sppequ.c + spprfs.c sppsv.c sppsvx.c spptrf.c spptri.c spptrs.c sptcon.c + spteqr.c sptrfs.c sptsv.c sptsvx.c spttrs.c sptts2.c srscl.c + ssbev.c ssbevd.c ssbevx.c ssbgst.c ssbgv.c ssbgvd.c ssbgvx.c + ssbtrd.c sspcon.c sspev.c sspevd.c sspevx.c sspgst.c + sspgv.c sspgvd.c sspgvx.c ssprfs.c sspsv.c sspsvx.c ssptrd.c + ssptrf.c ssptri.c ssptrs.c sstegr.c sstev.c sstevd.c sstevr.c + ssycon.c ssyev.c ssyevd.c ssyevr.c ssyevx.c ssygs2.c + ssygst.c ssygv.c ssygvd.c ssygvx.c ssyrfs.c ssysv.c ssysvx.c + ssytd2.c ssytf2.c ssytrd.c ssytrf.c ssytri.c ssytri2.c ssytri2x.c + ssyswapr.c ssytrs.c ssytrs2.c + ssyconv.c ssyconvf.c ssyconvf_rook.c + ssysv_aa.c ssysv_aa_2stage.c ssytrf_aa.c ssytrf_aa_2stage.c ssytrs_aa.c ssytrs_aa_2stage.c + ssytf2_rook.c ssytrf_rook.c ssytrs_rook.c + ssytri_rook.c ssycon_rook.c ssysv_rook.c + ssytf2_rk.c ssytrf_rk.c ssytrs_3.c + ssytri_3.c ssytri_3x.c ssycon_3.c ssysv_rk.c + ssysv_aa.c ssytrf_aa.c ssytrs_aa.c + stbcon.c + stbrfs.c stbtrs.c stgevc.c stgex2.c stgexc.c stgsen.c + stgsja.c stgsna.c stgsy2.c stgsyl.c stpcon.c stprfs.c stptri.c + stptrs.c + strcon.c strevc.c strevc3.c strexc.c strrfs.c strsen.c strsna.c strsyl.c + strtrs.c stzrzf.c sstemr.c + slansf.c spftrf.c spftri.c spftrs.c ssfrk.c stfsm.c stftri.c stfttp.c + stfttr.c stpttf.c stpttr.c strttf.c strttp.c + sgejsv.c sgesvj.c sgsvj0.c sgsvj1.c + sgeequb.c ssyequb.c spoequb.c sgbequb.c + sbbcsd.c slapmr.c sorbdb.c sorbdb1.c sorbdb2.c sorbdb3.c sorbdb4.c + sorbdb5.c sorbdb6.c sorcsd.c sorcsd2by1.c + sgeqrt.c sgeqrt2.c sgeqrt3.c sgemqrt.c + stpqrt.c stpqrt2.c stpmqrt.c stprfb.c + sgelqt.c sgelqt3.c sgemlqt.c + sgetsls.c sgetsqrhrt.c sgeqr.c slatsqr.c slamtsqr.c sgemqr.c + sgelq.c slaswlq.c slamswlq.c sgemlq.c + stplqt.c stplqt2.c stpmlqt.c + ssytrd_2stage.c ssytrd_sy2sb.c ssytrd_sb2st.c ssb2st_kernels.c + ssyevd_2stage.c ssyev_2stage.c ssyevx_2stage.c ssyevr_2stage.c + ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c + sgesvdq.c slaorhr_col_getrfnp.c + slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c + slarmm.c slatrs3.c strsyl3.c) + +set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c + sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c + sla_syrfsx_extended.c sla_syamv.c sla_syrcond.c sla_syrpvgrw.c + sposvxx.c sporfsx.c sla_porfsx_extended.c sla_porcond.c + sla_porpvgrw.c sgbsvxx.c sgbrfsx.c sla_gbrfsx_extended.c + sla_gbamv.c sla_gbrcond.c sla_gbrpvgrw.c sla_lin_berr.c slarscl2.c + slascl2.c sla_wwaddw.c) + +set(CLASRC + cbdsqr.c cgbbrd.c cgbcon.c cgbequ.c cgbrfs.c cgbsv.c cgbsvx.c + cgbtf2.c cgbtrf.c cgbtrs.c cgebak.c cgebal.c cgebd2.c cgebrd.c + cgecon.c cgeequ.c cgees.c cgeesx.c cgeev.c cgeevx.c + cgehd2.c cgehrd.c cgelq2.c cgelqf.c + cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c + cgeqr2.c cgeqr2p.c cgeqrf.c cgeqrfp.c cgerfs.c cgerq2.c cgerqf.c + cgesc2.c cgesdd.c cgesvd.c cgesvdx.c + cgesvj.c cgejsv.c cgsvj0.c cgsvj1.c + cgesvx.c cgetc2.c cgetrf2.c + cgetri.c + cggbak.c cggbal.c + cgges.c cgges3.c cggesx.c cggev.c cggev3.c cggevx.c + cggglm.c cgghrd.c cgghd3.c cgglse.c cggqrf.c 
cggrqf.c + cggsvd3.c cggsvp3.c + cgtcon.c cgtrfs.c cgtsv.c cgtsvx.c cgttrf.c cgttrs.c cgtts2.c chbev.c + chbevd.c chbevx.c chbgst.c chbgv.c chbgvd.c chbgvx.c chbtrd.c + checon.c cheev.c cheevd.c cheevr.c cheevx.c chegs2.c chegst.c + chegv.c chegvd.c chegvx.c cherfs.c chesv.c chesvx.c chetd2.c + chetf2.c chetrd.c + chetrf.c chetri.c chetri2.c chetri2x.c cheswapr.c + chetrs.c chetrs2.c + chetf2_rook.c chetrf_rook.c chetri_rook.c + chetrs_rook.c checon_rook.c chesv_rook.c + chetf2_rk.c chetrf_rk.c chetri_3.c chetri_3x.c + chetrs_3.c checon_3.c chesv_rk.c + chesv_aa.c chesv_aa_2stage.c chetrf_aa.c chetrf_aa_2stage.c chetrs_aa.c chetrs_aa_2stage.c + chgeqz.c chpcon.c chpev.c chpevd.c + chpevx.c chpgst.c chpgv.c chpgvd.c chpgvx.c chprfs.c chpsv.c + chpsvx.c + chptrd.c chptrf.c chptri.c chptrs.c chsein.c chseqr.c clabrd.c + clacgv.c clacon.c clacn2.c clacp2.c clacpy.c clacrm.c clacrt.c cladiv.c + claed0.c claed7.c claed8.c + claein.c claesy.c claev2.c clags2.c clagtm.c + clahef.c clahef_rook.c clahef_rk.c clahef_aa.c clahqr.c + clahr2.c claic1.c clals0.c clalsa.c clalsd.c clangb.c clange.c clangt.c + clanhb.c clanhe.c + clanhp.c clanhs.c clanht.c clansb.c clansp.c clansy.c clantb.c + clantp.c clantr.c clapll.c clapmt.c clarcm.c claqgb.c claqge.c + claqhb.c claqhe.c claqhp.c claqp2.c claqps.c claqsb.c + claqr0.c claqr1.c claqr2.c claqr3.c claqr4.c claqr5.c + claqsp.c claqsy.c clar1v.c clar2v.c ilaclr.c ilaclc.c + clarf.c clarfb.c clarfb_gett.c clarfg.c clarfgp.c clarft.c + clarfx.c clarfy.c clargv.c clarnv.c clarrv.c clartg.c clartv.c + clarz.c clarzb.c clarzt.c clascl.c claset.c clasr.c classq.c + clasyf.c clasyf_rook.c clasyf_rk.c clasyf_aa.c + clatbs.c clatdf.c clatps.c clatrd.c clatrs.c clatrz.c + cpbcon.c cpbequ.c cpbrfs.c cpbstf.c cpbsv.c + cpbsvx.c cpbtf2.c cpbtrf.c cpbtrs.c cpocon.c cpoequ.c cporfs.c + cposv.c cposvx.c cpotrf2.c cpotri.c cpstrf.c cpstf2.c + cppcon.c cppequ.c cpprfs.c cppsv.c cppsvx.c cpptrf.c cpptri.c cpptrs.c + cptcon.c cpteqr.c cptrfs.c cptsv.c cptsvx.c cpttrf.c cpttrs.c cptts2.c + crot.c cspcon.c csprfs.c cspsv.c + cspsvx.c csptrf.c csptri.c csptrs.c csrscl.c cstedc.c + cstegr.c cstein.c csteqr.c csycon.c + csyrfs.c csysv.c csysvx.c csytf2.c csytrf.c csytri.c + csytri2.c csytri2x.c csyswapr.c + csytrs.c csytrs2.c + csyconv.c csyconvf.c csyconvf_rook.c + csytf2_rook.c csytrf_rook.c csytrs_rook.c + csytri_rook.c csycon_rook.c csysv_rook.c + csytf2_rk.c csytrf_rk.c csytrf_aa.c csytrf_aa_2stage.c csytrs_3.c csytrs_aa.c csytrs_aa_2stage.c + csytri_3.c csytri_3x.c csycon_3.c csysv_rk.c csysv_aa.c csysv_aa_2stage.c + ctbcon.c ctbrfs.c ctbtrs.c ctgevc.c ctgex2.c + ctgexc.c ctgsen.c ctgsja.c ctgsna.c ctgsy2.c ctgsyl.c ctpcon.c + ctprfs.c ctptri.c + ctptrs.c ctrcon.c ctrevc.c ctrevc3.c ctrexc.c ctrrfs.c ctrsen.c ctrsna.c + ctrsyl.c ctrtrs.c ctzrzf.c cung2l.c cung2r.c + cungbr.c cunghr.c cungl2.c cunglq.c cungql.c cungqr.c cungr2.c + cungrq.c cungtr.c cunm2l.c cunm2r.c cunmbr.c cunmhr.c cunml2.c cunm22.c + cunmlq.c cunmql.c cunmqr.c cunmr2.c cunmr3.c cunmrq.c cunmrz.c + cunmtr.c cupgtr.c cupmtr.c icmax1.c scsum1.c cstemr.c + chfrk.c ctfttp.c clanhf.c cpftrf.c cpftri.c cpftrs.c ctfsm.c ctftri.c + ctfttr.c ctpttf.c ctpttr.c ctrttf.c ctrttp.c + cgeequb.c cgbequb.c csyequb.c cpoequb.c cheequb.c + cbbcsd.c clapmr.c cunbdb.c cunbdb1.c cunbdb2.c cunbdb3.c cunbdb4.c + cunbdb5.c cunbdb6.c cuncsd.c cuncsd2by1.c + cgeqrt.c cgeqrt2.c cgeqrt3.c cgemqrt.c + ctpqrt.c ctpqrt2.c ctpmqrt.c ctprfb.c + cgelqt.c cgelqt3.c cgemlqt.c + cgetsls.c cgetsqrhrt.c cgeqr.c clatsqr.c clamtsqr.c cgemqr.c + 
cgelq.c claswlq.c clamswlq.c cgemlq.c + ctplqt.c ctplqt2.c ctpmlqt.c + chetrd_2stage.c chetrd_he2hb.c chetrd_hb2st.c chb2st_kernels.c + cheevd_2stage.c cheev_2stage.c cheevx_2stage.c cheevr_2stage.c + chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c + cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c + cungtsqr.c cungtsqr_row.c cunhr_col.c + clatrs3.c ctrsyl3.c) + +set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c + cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c + csysvxx.c csyrfsx.c cla_syrfsx_extended.c cla_syamv.c + cla_syrcond_c.c cla_syrcond_x.c cla_syrpvgrw.c + cposvxx.c cporfsx.c cla_porfsx_extended.c + cla_porcond_c.c cla_porcond_x.c cla_porpvgrw.c + cgbsvxx.c cgbrfsx.c cla_gbrfsx_extended.c cla_gbamv.c + cla_gbrcond_c.c cla_gbrcond_x.c cla_gbrpvgrw.c + chesvxx.c cherfsx.c cla_herfsx_extended.c cla_heamv.c + cla_hercond_c.c cla_hercond_x.c cla_herpvgrw.c + cla_lin_berr.c clarscl2.c clascl2.c cla_wwaddw.c) + +set(DLASRC + dgbbrd.c dgbcon.c dgbequ.c dgbrfs.c dgbsv.c + dgbsvx.c dgbtf2.c dgbtrf.c dgbtrs.c dgebak.c dgebal.c dgebd2.c + dgebrd.c dgecon.c dgeequ.c dgees.c dgeesx.c dgeev.c dgeevx.c + dgehd2.c dgehrd.c dgelq2.c dgelqf.c + dgels.c dgelsd.c dgelss.c dgelsy.c dgeql2.c dgeqlf.c + dgeqp3.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c + dgesc2.c dgesdd.c dgesvd.c dgesvdx.c dgesvx.c dgetc2.c + dgetrf2.c dgetri.c + dggbak.c dggbal.c + dgges.c dgges3.c dggesx.c dggev.c dggev3.c dggevx.c + dggglm.c dgghrd.c dgghd3.c dgglse.c dggqrf.c + dggrqf.c dggsvd3.c dggsvp3.c dgtcon.c dgtrfs.c dgtsv.c + dgtsvx.c dgttrf.c dgttrs.c dgtts2.c dhgeqz.c + dhsein.c dhseqr.c dlabrd.c dlacon.c dlacn2.c + dlaein.c dlaexc.c dlag2.c dlags2.c dlagtm.c dlagv2.c dlahqr.c + dlahr2.c dlaic1.c dlaln2.c dlals0.c dlalsa.c dlalsd.c + dlangb.c dlange.c dlangt.c dlanhs.c dlansb.c dlansp.c + dlansy.c dlantb.c dlantp.c dlantr.c dlanv2.c + dlapll.c dlapmt.c + dlaqgb.c dlaqge.c dlaqp2.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c + dlaqr0.c dlaqr1.c dlaqr2.c dlaqr3.c dlaqr4.c dlaqr5.c + dlaqtr.c dlar1v.c dlar2v.c iladlr.c iladlc.c + dlarf.c dlarfb.c dlarfb_gett.c dlarfg.c dlarfgp.c dlarft.c dlarfx.c dlarfy.c + dlargv.c dlarrv.c dlartv.c + dlarz.c dlarzb.c dlarzt.c dlasy2.c + dlasyf.c dlasyf_rook.c dlasyf_rk.c dlasyf_aa.c + dlatbs.c dlatdf.c dlatps.c dlatrd.c dlatrs.c dlatrz.c + dopgtr.c dopmtr.c dorg2l.c dorg2r.c + dorgbr.c dorghr.c dorgl2.c dorglq.c dorgql.c dorgqr.c dorgr2.c + dorgrq.c dorgtr.c dorm2l.c dorm2r.c dorm22.c + dormbr.c dormhr.c dorml2.c dormlq.c dormql.c dormqr.c dormr2.c + dormr3.c dormrq.c dormrz.c dormtr.c dpbcon.c dpbequ.c dpbrfs.c + dpbstf.c dpbsv.c dpbsvx.c + dpbtf2.c dpbtrf.c dpbtrs.c dpocon.c dpoequ.c dporfs.c dposv.c + dposvx.c dpotrf2.c dpotri.c dpotrs.c dpstrf.c dpstf2.c + dppcon.c dppequ.c + dpprfs.c dppsv.c dppsvx.c dpptrf.c dpptri.c dpptrs.c dptcon.c + dpteqr.c dptrfs.c dptsv.c dptsvx.c dpttrs.c dptts2.c drscl.c + dsbev.c dsbevd.c dsbevx.c dsbgst.c dsbgv.c dsbgvd.c dsbgvx.c + dsbtrd.c dspcon.c dspev.c dspevd.c dspevx.c dspgst.c + dspgv.c dspgvd.c dspgvx.c dsprfs.c dspsv.c dspsvx.c dsptrd.c + dsptrf.c dsptri.c dsptrs.c dstegr.c dstev.c dstevd.c dstevr.c + dsycon.c dsyev.c dsyevd.c dsyevr.c + dsyevx.c dsygs2.c dsygst.c dsygv.c dsygvd.c dsygvx.c dsyrfs.c + dsysv.c dsysvx.c + dsytd2.c dsytf2.c dsytrd.c dsytrf.c dsytri.c dsytrs.c dsytrs2.c + dsytri2.c dsytri2x.c dsyswapr.c + dsyconv.c dsyconvf.c dsyconvf_rook.c + dsytf2_rook.c dsytrf_rook.c dsytrs_rook.c + dsytri_rook.c dsycon_rook.c dsysv_rook.c + dsytf2_rk.c dsytrf_rk.c dsytrs_3.c + dsytri_3.c 
dsytri_3x.c dsycon_3.c dsysv_rk.c + dsysv_aa.c dsysv_aa_2stage.c dsytrf_aa.c dsytrf_aa_2stage.c dsytrs_aa.c dsytrs_aa_2stage.c + dtbcon.c + dtbrfs.c dtbtrs.c dtgevc.c dtgex2.c dtgexc.c dtgsen.c + dtgsja.c dtgsna.c dtgsy2.c dtgsyl.c dtpcon.c dtprfs.c dtptri.c + dtptrs.c + dtrcon.c dtrevc.c dtrevc3.c dtrexc.c dtrrfs.c dtrsen.c dtrsna.c dtrsyl.c + dtrtrs.c dtzrzf.c dstemr.c + dsgesv.c dsposv.c dlag2s.c slag2d.c dlat2s.c + dlansf.c dpftrf.c dpftri.c dpftrs.c dsfrk.c dtfsm.c dtftri.c dtfttp.c + dtfttr.c dtpttf.c dtpttr.c dtrttf.c dtrttp.c + dgejsv.c dgesvj.c dgsvj0.c dgsvj1.c + dgeequb.c dsyequb.c dpoequb.c dgbequb.c + dbbcsd.c dlapmr.c dorbdb.c dorbdb1.c dorbdb2.c dorbdb3.c dorbdb4.c + dorbdb5.c dorbdb6.c dorcsd.c dorcsd2by1.c + dgeqrt.c dgeqrt2.c dgeqrt3.c dgemqrt.c + dtpqrt.c dtpqrt2.c dtpmqrt.c dtprfb.c + dgelqt.c dgelqt3.c dgemlqt.c + dgetsls.c dgetsqrhrt.c dgeqr.c dlatsqr.c dlamtsqr.c dgemqr.c + dgelq.c dlaswlq.c dlamswlq.c dgemlq.c + dtplqt.c dtplqt2.c dtpmlqt.c + dsytrd_2stage.c dsytrd_sy2sb.c dsytrd_sb2st.c dsb2st_kernels.c + dsyevd_2stage.c dsyev_2stage.c dsyevx_2stage.c dsyevr_2stage.c + dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c + dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c + dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c + dlarmm.c dlatrs3.c dtrsyl3.c) + +set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c + dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c + dla_syrfsx_extended.c dla_syamv.c dla_syrcond.c dla_syrpvgrw.c + dposvxx.c dporfsx.c dla_porfsx_extended.c dla_porcond.c + dla_porpvgrw.c dgbsvxx.c dgbrfsx.c dla_gbrfsx_extended.c + dla_gbamv.c dla_gbrcond.c dla_gbrpvgrw.c dla_lin_berr.c dlarscl2.c + dlascl2.c dla_wwaddw.c) + +set(ZLASRC + zbdsqr.c zgbbrd.c zgbcon.c zgbequ.c zgbrfs.c zgbsv.c zgbsvx.c + zgbtf2.c zgbtrf.c zgbtrs.c zgebak.c zgebal.c zgebd2.c zgebrd.c + zgecon.c zgeequ.c zgees.c zgeesx.c zgeev.c zgeevx.c + zgehd2.c zgehrd.c zgelq2.c zgelqf.c + zgels.c zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c + zgeqr2.c zgeqr2p.c zgeqrf.c zgeqrfp.c zgerfs.c zgerq2.c zgerqf.c + zgesc2.c zgesdd.c zgesvd.c zgesvdx.c zgesvx.c + zgesvj.c zgejsv.c zgsvj0.c zgsvj1.c + zgetc2.c zgetrf2.c + zgetri.c + zggbak.c zggbal.c + zgges.c zgges3.c zggesx.c zggev.c zggev3.c zggevx.c + zggglm.c zgghrd.c zgghd3.c zgglse.c zggqrf.c zggrqf.c + zggsvd3.c zggsvp3.c + zgtcon.c zgtrfs.c zgtsv.c zgtsvx.c zgttrf.c zgttrs.c zgtts2.c zhbev.c + zhbevd.c zhbevx.c zhbgst.c zhbgv.c zhbgvd.c zhbgvx.c zhbtrd.c + zhecon.c zheev.c zheevd.c zheevr.c zheevx.c zhegs2.c zhegst.c + zhegv.c zhegvd.c zhegvx.c zherfs.c zhesv.c zhesvx.c zhetd2.c + zhetf2.c zhetrd.c + zhetrf.c zhetri.c zhetri2.c zhetri2x.c zheswapr.c + zhetrs.c zhetrs2.c + zhetf2_rook.c zhetrf_rook.c zhetri_rook.c + zhetrs_rook.c zhecon_rook.c zhesv_rook.c + zhetf2_rk.c zhetrf_rk.c zhetri_3.c zhetri_3x.c + zhetrs_3.c zhecon_3.c zhesv_rk.c + zhesv_aa.c zhesv_aa_2stage.c zhetrf_aa.c zhetrf_aa_2stage.c zhetrs_aa.c zhetrs_aa_2stage.c + zhgeqz.c zhpcon.c zhpev.c zhpevd.c + zhpevx.c zhpgst.c zhpgv.c zhpgvd.c zhpgvx.c zhprfs.c zhpsv.c + zhpsvx.c + zhptrd.c zhptrf.c zhptri.c zhptrs.c zhsein.c zhseqr.c zlabrd.c + zlacgv.c zlacon.c zlacn2.c zlacp2.c zlacpy.c zlacrm.c zlacrt.c zladiv.c + zlaed0.c zlaed7.c zlaed8.c + zlaein.c zlaesy.c zlaev2.c zlags2.c zlagtm.c + zlahef.c zlahef_rook.c zlahef_rk.c zlahef_aa.c zlahqr.c + zlahr2.c zlaic1.c zlals0.c zlalsa.c zlalsd.c zlangb.c zlange.c + zlangt.c zlanhb.c + zlanhe.c + zlanhp.c zlanhs.c zlanht.c zlansb.c zlansp.c zlansy.c zlantb.c + zlantp.c zlantr.c zlapll.c zlapmt.c 
zlaqgb.c zlaqge.c + zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqps.c zlaqsb.c + zlaqr0.c zlaqr1.c zlaqr2.c zlaqr3.c zlaqr4.c zlaqr5.c + zlaqsp.c zlaqsy.c zlar1v.c zlar2v.c ilazlr.c ilazlc.c + zlarcm.c zlarf.c zlarfb.c zlarfb_gett.c + zlarfg.c zlarfgp.c zlarft.c + zlarfx.c zlarfy.c zlargv.c zlarnv.c zlarrv.c zlartg.c zlartv.c + zlarz.c zlarzb.c zlarzt.c zlascl.c zlaset.c zlasr.c + zlassq.c zlasyf.c zlasyf_rook.c zlasyf_rk.c zlasyf_aa.c + zlatbs.c zlatdf.c zlatps.c zlatrd.c zlatrs.c zlatrz.c + zpbcon.c zpbequ.c zpbrfs.c zpbstf.c zpbsv.c + zpbsvx.c zpbtf2.c zpbtrf.c zpbtrs.c zpocon.c zpoequ.c zporfs.c + zposv.c zposvx.c zpotrf2.c zpotri.c zpotrs.c zpstrf.c zpstf2.c + zppcon.c zppequ.c zpprfs.c zppsv.c zppsvx.c zpptrf.c zpptri.c zpptrs.c + zptcon.c zpteqr.c zptrfs.c zptsv.c zptsvx.c zpttrf.c zpttrs.c zptts2.c + zrot.c zspcon.c zsprfs.c zspsv.c + zspsvx.c zsptrf.c zsptri.c zsptrs.c zdrscl.c zstedc.c + zstegr.c zstein.c zsteqr.c zsycon.c + zsyrfs.c zsysv.c zsysvx.c zsytf2.c zsytrf.c zsytri.c + zsytri2.c zsytri2x.c zsyswapr.c + zsytrs.c zsytrs2.c + zsyconv.c zsyconvf.c zsyconvf_rook.c + zsytf2_rook.c zsytrf_rook.c zsytrs_rook.c zsytrs_aa.c zsytrs_aa_2stage.c + zsytri_rook.c zsycon_rook.c zsysv_rook.c + zsytf2_rk.c zsytrf_rk.c zsytrf_aa.c zsytrf_aa_2stage.c zsytrs_3.c + zsytri_3.c zsytri_3x.c zsycon_3.c zsysv_rk.c zsysv_aa.c zsysv_aa_2stage.c + ztbcon.c ztbrfs.c ztbtrs.c ztgevc.c ztgex2.c + ztgexc.c ztgsen.c ztgsja.c ztgsna.c ztgsy2.c ztgsyl.c ztpcon.c + ztprfs.c ztptri.c + ztptrs.c ztrcon.c ztrevc.c ztrevc3.c ztrexc.c ztrrfs.c ztrsen.c ztrsna.c + ztrsyl.c ztrtrs.c ztzrzf.c zung2l.c + zung2r.c zungbr.c zunghr.c zungl2.c zunglq.c zungql.c zungqr.c zungr2.c + zungrq.c zungtr.c zunm2l.c zunm2r.c zunmbr.c zunmhr.c zunml2.c zunm22.c + zunmlq.c zunmql.c zunmqr.c zunmr2.c zunmr3.c zunmrq.c zunmrz.c + zunmtr.c zupgtr.c + zupmtr.c izmax1.c dzsum1.c zstemr.c + zcgesv.c zcposv.c zlag2c.c clag2z.c zlat2c.c + zhfrk.c ztfttp.c zlanhf.c zpftrf.c zpftri.c zpftrs.c ztfsm.c ztftri.c + ztfttr.c ztpttf.c ztpttr.c ztrttf.c ztrttp.c + zgeequb.c zgbequb.c zsyequb.c zpoequb.c zheequb.c + zbbcsd.c zlapmr.c zunbdb.c zunbdb1.c zunbdb2.c zunbdb3.c zunbdb4.c + zunbdb5.c zunbdb6.c zuncsd.c zuncsd2by1.c + zgeqrt.c zgeqrt2.c zgeqrt3.c zgemqrt.c + ztpqrt.c ztpqrt2.c ztpmqrt.c ztprfb.c + ztplqt.c ztplqt2.c ztpmlqt.c + zgelqt.c zgelqt3.c zgemlqt.c + zgetsls.c zgetsqrhrt.c zgeqr.c zlatsqr.c zlamtsqr.c zgemqr.c + zgelq.c zlaswlq.c zlamswlq.c zgemlq.c + zhetrd_2stage.c zhetrd_he2hb.c zhetrd_hb2st.c zhb2st_kernels.c + zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c + zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c + zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c + zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c) + +set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c + zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c + zla_syrfsx_extended.c zla_syamv.c zla_syrcond_c.c zla_syrcond_x.c + zla_syrpvgrw.c zposvxx.c zporfsx.c zla_porfsx_extended.c + zla_porcond_c.c zla_porcond_x.c zla_porpvgrw.c zgbsvxx.c zgbrfsx.c + zla_gbrfsx_extended.c zla_gbamv.c zla_gbrcond_c.c zla_gbrcond_x.c + zla_gbrpvgrw.c zhesvxx.c zherfsx.c zla_herfsx_extended.c + zla_heamv.c zla_hercond_c.c zla_hercond_x.c zla_herpvgrw.c + zla_lin_berr.c zlarscl2.c zlascl2.c zla_wwaddw.c) + + +if(USE_XBLAS) + set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC}) +endif() + +list(APPEND SLASRC DEPRECATED/sgegs.c DEPRECATED/sgegv.c + DEPRECATED/sgeqpf.c DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c + 
DEPRECATED/sggsvp.c DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c) +list(APPEND DLASRC DEPRECATED/dgegs.c DEPRECATED/dgegv.c + DEPRECATED/dgeqpf.c DEPRECATED/dgelsx.c DEPRECATED/dggsvd.c + DEPRECATED/dggsvp.c DEPRECATED/dlahrd.c DEPRECATED/dlatzm.c DEPRECATED/dtzrqf.c) +list(APPEND CLASRC DEPRECATED/cgegs.c DEPRECATED/cgegv.c + DEPRECATED/cgeqpf.c DEPRECATED/cgelsx.c DEPRECATED/cggsvd.c + DEPRECATED/cggsvp.c DEPRECATED/clahrd.c DEPRECATED/clatzm.c DEPRECATED/ctzrqf.c) +list(APPEND ZLASRC DEPRECATED/zgegs.c DEPRECATED/zgegv.c + DEPRECATED/zgeqpf.c DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c + DEPRECATED/zggsvp.c DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c) +message(STATUS "Building deprecated routines") + +set(DSLASRC spotrs.c) + +set(ZCLASRC cpotrs.c) + +set(SCATGEN slatm1.c slaran.c slarnd.c) + +set(SMATGEN slatms.c slatme.c slatmr.c slatmt.c + slagge.c slagsy.c slakf2.c slarge.c slaror.c slarot.c slatm2.c + slatm3.c slatm5.c slatm6.c slatm7.c slahilb.c) + +set(CMATGEN clatms.c clatme.c clatmr.c clatmt.c + clagge.c claghe.c clagsy.c clakf2.c clarge.c claror.c clarot.c + clatm1.c clarnd.c clatm2.c clatm3.c clatm5.c clatm6.c clahilb.c slatm7.c) + +set(DZATGEN dlatm1.c dlaran.c dlarnd.c) + +set(DMATGEN dlatms.c dlatme.c dlatmr.c dlatmt.c + dlagge.c dlagsy.c dlakf2.c dlarge.c dlaror.c dlarot.c dlatm2.c + dlatm3.c dlatm5.c dlatm6.c dlatm7.c dlahilb.c) + +set(ZMATGEN zlatms.c zlatme.c zlatmr.c zlatmt.c + zlagge.c zlaghe.c zlagsy.c zlakf2.c zlarge.c zlaror.c zlarot.c + zlatm1.c zlarnd.c zlatm2.c zlatm3.c zlatm5.c zlatm6.c zlahilb.c dlatm7.c) + +if(BUILD_SINGLE) + set(LA_REL_SRC ${SLASRC} ${DSLASRC} ${ALLAUX} ${SCLAUX}) + set(LA_GEN_SRC ${SMATGEN} ${SCATGEN}) + message(STATUS "Building Single Precision") +endif() +if(BUILD_DOUBLE) + set(LA_REL_SRC ${LA_REL_SRC} ${DLASRC} ${DSLASRC} ${ALLAUX} ${DZLAUX}) + set(LA_GEN_SRC ${LA_GEN_SRC} ${DMATGEN} ${DZATGEN}) + message(STATUS "Building Double Precision") +endif() +if(BUILD_COMPLEX) + set(LA_REL_SRC ${LA_REL_SRC} ${CLASRC} ${ZCLASRC} ${ALLAUX} ${SCLAUX}) + SET(LA_GEN_SRC ${LA_GEN_SRC} ${CMATGEN} ${SCATGEN}) + message(STATUS "Building Single Precision Complex") +endif() +if(BUILD_COMPLEX16) + set(LA_REL_SRC ${LA_REL_SRC} ${ZLASRC} ${ZCLASRC} ${ALLAUX} ${DZLAUX}) + SET(LA_GEN_SRC ${LA_GEN_SRC} ${ZMATGEN} ${DZATGEN}) +# for zlange/zlanhe + if (NOT BUILD_DOUBLE) + set (LA_REL_SRC ${LA_REL_SRC} dcombssq.c) + endif () + message(STATUS "Building Double Precision Complex") +endif() + +endif() + +# add lapack-netlib folder to the sources +set(LA_SOURCES "") +foreach (LA_FILE ${LA_REL_SRC}) + list(APPEND LA_SOURCES "${NETLIB_LAPACK_DIR}/SRC/${LA_FILE}") +endforeach () +foreach (LA_FILE ${LA_GEN_SRC}) + list(APPEND LA_SOURCES "${NETLIB_LAPACK_DIR}/TESTING/MATGEN/${LA_FILE}") +endforeach () + +if (NOT C_LAPACK) + set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}") + if (${F_COMPILER} STREQUAL "GFORTRAN") + set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS} -fno-tree-vectorize") + endif() +else () + set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") +endif () From 1688c7da439c377c0e7c8491c711655f1ff1c2ef Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 16 Nov 2022 22:24:01 +0100 Subject: [PATCH 100/154] change line endings from CRLF to LF --- kernel/arm64/sgemm_ncopy_4.S | 666 +- kernel/arm64/sgemm_tcopy_16.S | 1628 +- kernel/power/cgemm_kernel_power9.S | 586 +- kernel/power/cgemm_logic_power9.S | 5632 +++---- 
kernel/power/cgemm_macros_power9.S | 6036 +++---- kernel/power/cgemv_n.c | 1194 +- kernel/power/cgemv_t.c | 1202 +- kernel/power/crot.c | 466 +- kernel/power/dgemm_kernel_power9.S | 498 +- kernel/power/dgemm_logic_power9.S | 3962 ++--- kernel/power/dgemm_macros_power9.S | 7244 ++++---- kernel/power/icamax.c | 656 +- kernel/power/icamin.c | 532 +- kernel/power/isamax.c | 576 +- kernel/power/isamin.c | 576 +- kernel/power/sgemm_kernel_power9.S | 544 +- kernel/power/sgemm_logic_power9.S | 4382 ++--- kernel/power/sgemm_macros_power9.S | 11148 ++++++------- kernel/power/sgemv_n.c | 940 +- kernel/power/sgemv_n_8.c | 1028 +- kernel/power/sgemv_t.c | 968 +- kernel/power/sgemv_t_8.c | 1016 +- kernel/power/zgemm_kernel_power9.S | 488 +- kernel/power/zgemm_logic_power9.S | 3780 ++--- kernel/power/zgemm_macros_power9.S | 3648 ++--- kernel/x86_64/sgemm_kernel_16x4_haswell.S | 13612 ++++++++-------- .../strsm_kernel_8x4_haswell_R_common.h | 452 +- kernel/x86_64/zgemm_kernel_2x2_bulldozer.S | 2808 ++-- kernel/x86_64/zgemm_kernel_2x2_piledriver.S | 2858 ++-- kernel/x86_64/zgemm_kernel_4x2_haswell.S | 7762 ++++----- relapack/src/CMakeLists.txt | 172 +- 31 files changed, 43530 insertions(+), 43530 deletions(-) diff --git a/kernel/arm64/sgemm_ncopy_4.S b/kernel/arm64/sgemm_ncopy_4.S index 30450cc7d..c819ee6fb 100644 --- a/kernel/arm64/sgemm_ncopy_4.S +++ b/kernel/arm64/sgemm_ncopy_4.S @@ -1,333 +1,333 @@ -/*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define M x0 -#define N x1 -#define A00 x2 -#define LDA x3 -#define B00 x4 - -#define A01 x5 -#define A02 x6 -#define A03 x7 -#define A04 x8 - -#define I x9 -#define J x10 - -#define TEMP1 x11 -#define TEMP2 x12 - -#define A_PREFETCH 2560 - -/************************************************************************************** -* Macro definitions -**************************************************************************************/ - -.macro SAVE_REGS - add sp, sp, #-(11 * 16) - stp d8, d9, [sp, #(0 * 16)] - stp d10, d11, [sp, #(1 * 16)] - stp d12, d13, [sp, #(2 * 16)] - stp d14, d15, [sp, #(3 * 16)] - stp d16, d17, [sp, #(4 * 16)] - stp x18, x19, [sp, #(5 * 16)] - stp x20, x21, [sp, #(6 * 16)] - stp x22, x23, [sp, #(7 * 16)] - stp x24, x25, [sp, #(8 * 16)] - stp x26, x27, [sp, #(9 * 16)] - str x28, [sp, #(10 * 16)] -.endm - -.macro RESTORE_REGS - ldp d8, d9, [sp, #(0 * 16)] - ldp d10, d11, [sp, #(1 * 16)] - ldp d12, d13, [sp, #(2 * 16)] - ldp d14, d15, [sp, #(3 * 16)] - ldp d16, d17, [sp, #(4 * 16)] - ldp x18, x19, [sp, #(5 * 16)] - ldp x20, x21, [sp, #(6 * 16)] - ldp x22, x23, [sp, #(7 * 16)] - ldp x24, x25, [sp, #(8 * 16)] - ldp x26, x27, [sp, #(9 * 16)] - ldr x28, [sp, #(10 * 16)] - add sp, sp, #(11*16) -.endm - -.macro COPY4x4 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr q0, [A01], #16 - ins v8.s[0], v0.s[0] - ins v9.s[0], v0.s[1] - ins v10.s[0], v0.s[2] - ins v11.s[0], v0.s[3] - - ldr q1, [A02], #16 - ins v8.s[1], v1.s[0] - ins v9.s[1], v1.s[1] - ins v10.s[1], v1.s[2] - ins v11.s[1], v1.s[3] - - ldr q2, [A03], #16 - ins v8.s[2], v2.s[0] - ins v9.s[2], v2.s[1] - ins v10.s[2], v2.s[2] - ins v11.s[2], v2.s[3] - - ldr q3, [A04], #16 - ins v8.s[3], v3.s[0] - ins v9.s[3], v3.s[1] - ins v10.s[3], v3.s[2] - ins v11.s[3], v3.s[3] - - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00] - add B00, B00, #64 - -.endm - -.macro COPY1x4 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr s0, [A01], #4 - ldr s1, [A02], #4 - ldr s2, [A03], #4 - ldr s3, [A04], #4 - - stp s0, s1, [B00] - add B00, B00, #8 - stp s2, s3, [B00] - add B00, B00, #8 -.endm - -.macro COPY4x2 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr q0, [A01], #16 - ins v8.s[0], v0.s[0] - ins v9.s[0], v0.s[1] - ins v10.s[0], v0.s[2] - ins v11.s[0], v0.s[3] - - ldr q1, [A02], #16 - ins v8.s[1], v1.s[0] - ins v9.s[1], v1.s[1] - ins v10.s[1], v1.s[2] - ins v11.s[1], v1.s[3] - - st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] - add B00, B00, #32 -.endm - - -.macro COPY1x2 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr s0, [A01], #4 - ldr s1, [A02], #4 - - stp s0, s1, [B00] - add B00, B00, #8 -.endm - -.macro COPY4x1 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldr q0, [A01], #16 - str q0, [B00], #16 -.endm - - -.macro COPY1x1 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldr s0, [A01], #4 - str s0, [B00], #4 -.endm - -/************************************************************************************** -* End of macro definitions -**************************************************************************************/ - - PROLOGUE - - .align 5 - - SAVE_REGS - - lsl LDA, LDA, #2 // LDA = LDA * SIZE - -.Ldgemm_ncopy_L4_BEGIN: - - asr J, N, #2 // J = N / 4 - cmp 
J, #0 - ble .Ldgemm_ncopy_L2_BEGIN - - .align 5 -.Ldgemm_ncopy_L4_M4_BEGIN: - - mov A01, A00 - add A02, A01, LDA - add A03, A02, LDA - add A04, A03, LDA - add A00, A04, LDA - - asr I, M, #2 // I = M / 4 - cmp I, #0 - ble .Ldgemm_ncopy_L4_M4_40 - - .align 5 -.Ldgemm_ncopy_L4_M4_20: - - COPY4x4 - - subs I , I , #1 - bne .Ldgemm_ncopy_L4_M4_20 - -.Ldgemm_ncopy_L4_M4_40: - - and I, M , #3 - cmp I, #0 - ble .Ldgemm_ncopy_L4_M4_END - - .align 5 -.Ldgemm_ncopy_L4_M4_60: - - COPY1x4 - - subs I , I , #1 - bne .Ldgemm_ncopy_L4_M4_60 - -.Ldgemm_ncopy_L4_M4_END: - - subs J , J, #1 // j-- - bne .Ldgemm_ncopy_L4_M4_BEGIN - -/*********************************************************************************************/ - -.Ldgemm_ncopy_L2_BEGIN: - - tst N, #3 - ble .Ldgemm_ncopy_L999 - - tst N, #2 - ble .Ldgemm_ncopy_L1_BEGIN - -.Ldgemm_ncopy_L2_M4_BEGIN: - mov A01, A00 - add A02, A01, LDA - add A00, A02, LDA - - asr I, M, #2 // I = M / 4 - cmp I, #0 - ble .Ldgemm_ncopy_L2_M4_40 - - .align 5 -.Ldgemm_ncopy_L2_M4_20: - - COPY4x2 - - subs I , I , #1 - bne .Ldgemm_ncopy_L2_M4_20 - -.Ldgemm_ncopy_L2_M4_40: - - and I, M , #3 - cmp I, #0 - ble .Ldgemm_ncopy_L2_M4_END - - .align 5 -.Ldgemm_ncopy_L2_M4_60: - - COPY1x2 - - subs I , I , #1 - bne .Ldgemm_ncopy_L2_M4_60 - -.Ldgemm_ncopy_L2_M4_END: - - -/*********************************************************************************************/ - -.Ldgemm_ncopy_L1_BEGIN: - - tst N, #1 - ble .Ldgemm_ncopy_L999 - -.Ldgemm_ncopy_L1_M4_BEGIN: - - mov A01, A00 - - asr I, M, #2 // I = M / 4 - cmp I, #0 - ble .Ldgemm_ncopy_L1_M4_40 - - .align 5 -.Ldgemm_ncopy_L1_M4_20: - - COPY4x1 - - subs I , I , #1 - bne .Ldgemm_ncopy_L1_M4_20 - - -.Ldgemm_ncopy_L1_M4_40: - - and I, M , #3 - cmp I, #0 - ble .Ldgemm_ncopy_L1_M4_END - - .align 5 -.Ldgemm_ncopy_L1_M4_60: - - COPY1x1 - - subs I , I , #1 - bne .Ldgemm_ncopy_L1_M4_60 - - -.Ldgemm_ncopy_L1_M4_END: - -.Ldgemm_ncopy_L999: - - mov x0, #0 - RESTORE_REGS - ret - - EPILOGUE - +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define A00 x2 +#define LDA x3 +#define B00 x4 + +#define A01 x5 +#define A02 x6 +#define A03 x7 +#define A04 x8 + +#define I x9 +#define J x10 + +#define TEMP1 x11 +#define TEMP2 x12 + +#define A_PREFETCH 2560 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +.macro COPY4x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr q0, [A01], #16 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v10.s[0], v0.s[2] + ins v11.s[0], v0.s[3] + + ldr q1, [A02], #16 + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + ins v10.s[1], v1.s[2] + ins v11.s[1], v1.s[3] + + ldr q2, [A03], #16 + ins v8.s[2], v2.s[0] + ins v9.s[2], v2.s[1] + ins v10.s[2], v2.s[2] + ins v11.s[2], v2.s[3] + + ldr q3, [A04], #16 + ins v8.s[3], v3.s[0] + ins v9.s[3], v3.s[1] + ins v10.s[3], v3.s[2] + ins v11.s[3], v3.s[3] + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00] + add B00, B00, #64 + +.endm + +.macro COPY1x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr s0, [A01], #4 + ldr s1, [A02], #4 + ldr s2, [A03], #4 + ldr s3, [A04], #4 + + stp s0, s1, [B00] + add B00, B00, #8 + stp s2, s3, [B00] + add B00, B00, #8 +.endm + +.macro COPY4x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr q0, [A01], #16 + ins v8.s[0], v0.s[0] + ins v9.s[0], v0.s[1] + ins v10.s[0], v0.s[2] + ins v11.s[0], v0.s[3] + + ldr q1, [A02], #16 + ins v8.s[1], v1.s[0] + ins v9.s[1], v1.s[1] + ins v10.s[1], v1.s[2] + ins v11.s[1], v1.s[3] + + st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] + add B00, B00, #32 +.endm + + +.macro COPY1x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr s0, [A01], #4 + ldr s1, [A02], #4 + + stp s0, s1, [B00] + add B00, B00, #8 +.endm + +.macro COPY4x1 + prfm PLDL1KEEP, [A01, 
#A_PREFETCH] + + ldr q0, [A01], #16 + str q0, [B00], #16 +.endm + + +.macro COPY1x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr s0, [A01], #4 + str s0, [B00], #4 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + SAVE_REGS + + lsl LDA, LDA, #2 // LDA = LDA * SIZE + +.Ldgemm_ncopy_L4_BEGIN: + + asr J, N, #2 // J = N / 4 + cmp J, #0 + ble .Ldgemm_ncopy_L2_BEGIN + + .align 5 +.Ldgemm_ncopy_L4_M4_BEGIN: + + mov A01, A00 + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A00, A04, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Ldgemm_ncopy_L4_M4_40 + + .align 5 +.Ldgemm_ncopy_L4_M4_20: + + COPY4x4 + + subs I , I , #1 + bne .Ldgemm_ncopy_L4_M4_20 + +.Ldgemm_ncopy_L4_M4_40: + + and I, M , #3 + cmp I, #0 + ble .Ldgemm_ncopy_L4_M4_END + + .align 5 +.Ldgemm_ncopy_L4_M4_60: + + COPY1x4 + + subs I , I , #1 + bne .Ldgemm_ncopy_L4_M4_60 + +.Ldgemm_ncopy_L4_M4_END: + + subs J , J, #1 // j-- + bne .Ldgemm_ncopy_L4_M4_BEGIN + +/*********************************************************************************************/ + +.Ldgemm_ncopy_L2_BEGIN: + + tst N, #3 + ble .Ldgemm_ncopy_L999 + + tst N, #2 + ble .Ldgemm_ncopy_L1_BEGIN + +.Ldgemm_ncopy_L2_M4_BEGIN: + mov A01, A00 + add A02, A01, LDA + add A00, A02, LDA + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Ldgemm_ncopy_L2_M4_40 + + .align 5 +.Ldgemm_ncopy_L2_M4_20: + + COPY4x2 + + subs I , I , #1 + bne .Ldgemm_ncopy_L2_M4_20 + +.Ldgemm_ncopy_L2_M4_40: + + and I, M , #3 + cmp I, #0 + ble .Ldgemm_ncopy_L2_M4_END + + .align 5 +.Ldgemm_ncopy_L2_M4_60: + + COPY1x2 + + subs I , I , #1 + bne .Ldgemm_ncopy_L2_M4_60 + +.Ldgemm_ncopy_L2_M4_END: + + +/*********************************************************************************************/ + +.Ldgemm_ncopy_L1_BEGIN: + + tst N, #1 + ble .Ldgemm_ncopy_L999 + +.Ldgemm_ncopy_L1_M4_BEGIN: + + mov A01, A00 + + asr I, M, #2 // I = M / 4 + cmp I, #0 + ble .Ldgemm_ncopy_L1_M4_40 + + .align 5 +.Ldgemm_ncopy_L1_M4_20: + + COPY4x1 + + subs I , I , #1 + bne .Ldgemm_ncopy_L1_M4_20 + + +.Ldgemm_ncopy_L1_M4_40: + + and I, M , #3 + cmp I, #0 + ble .Ldgemm_ncopy_L1_M4_END + + .align 5 +.Ldgemm_ncopy_L1_M4_60: + + COPY1x1 + + subs I , I , #1 + bne .Ldgemm_ncopy_L1_M4_60 + + +.Ldgemm_ncopy_L1_M4_END: + +.Ldgemm_ncopy_L999: + + mov x0, #0 + RESTORE_REGS + ret + + EPILOGUE + diff --git a/kernel/arm64/sgemm_tcopy_16.S b/kernel/arm64/sgemm_tcopy_16.S index 431f1ae2a..3066421bb 100644 --- a/kernel/arm64/sgemm_tcopy_16.S +++ b/kernel/arm64/sgemm_tcopy_16.S @@ -1,814 +1,814 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. 
-*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define M x0 -#define N x1 -#define A x2 -#define LDA x3 -#define B x4 - -#define M8 x5 - -#define A01 x6 -#define A02 x7 -#define A03 x8 -#define A04 x9 -#define A05 x10 -#define A06 x11 -#define A07 x12 -#define A08 x13 - -#define B01 x14 -#define B02 x15 -#define B03 x16 -#define B04 x17 -#define B00 x22 - - -#define I x21 -#define J x19 - -#define TEMP1 x20 - -#define A_PREFETCH 256 - -/************************************************************************************** -* Macro definitions -**************************************************************************************/ -.macro SAVE_REGS - add sp, sp, #-(11 * 16) - stp d8, d9, [sp, #(0 * 16)] - stp d10, d11, [sp, #(1 * 16)] - stp d12, d13, [sp, #(2 * 16)] - stp d14, d15, [sp, #(3 * 16)] - stp d16, d17, [sp, #(4 * 16)] - stp x18, x19, [sp, #(5 * 16)] - stp x20, x21, [sp, #(6 * 16)] - stp x22, x23, [sp, #(7 * 16)] - stp x24, x25, [sp, #(8 * 16)] - stp x26, x27, [sp, #(9 * 16)] - str x28, [sp, #(10 * 16)] -.endm - -.macro RESTORE_REGS - ldp d8, d9, [sp, #(0 * 16)] - ldp d10, d11, [sp, #(1 * 16)] - ldp d12, d13, [sp, #(2 * 16)] - ldp d14, d15, [sp, #(3 * 16)] - ldp d16, d17, [sp, #(4 * 16)] - ldp x18, x19, [sp, #(5 * 16)] - ldp x20, x21, [sp, #(6 * 16)] - ldp x22, x23, [sp, #(7 * 16)] - ldp x24, x25, [sp, #(8 * 16)] - ldp x26, x27, [sp, #(9 * 16)] - ldr x28, [sp, #(10 * 16)] - add sp, sp, #(11*16) -.endm - -/*************************************************************************************************************************/ - -.macro COPY16x8 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - prfm PLDL1KEEP, [A05, #A_PREFETCH] - prfm PLDL1KEEP, [A06, #A_PREFETCH] - prfm PLDL1KEEP, [A07, #A_PREFETCH] - prfm PLDL1KEEP, [A08, #A_PREFETCH] - //prfm PSTL1KEEP, [B00, M8] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] - add A01, A01, #64 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] - add TEMP1, B00, #64 - - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] - add A02, A02, #64 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] - add A03, A03, #64 - - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] - add A04, A04, #64 - - st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [A05] - add A05, A05, #64 - - st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [A06] - add A06, A06, #64 - - st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [A07] - add A07, A07, #64 - - st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [A08] - add A08, A08, #64 - - st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - add B00, B00, M8 - -.endm - -.macro COPY8x8 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - prfm PLDL1KEEP, [A05, #A_PREFETCH] - prfm PLDL1KEEP, [A06, #A_PREFETCH] - prfm PLDL1KEEP, [A07, #A_PREFETCH] - prfm PLDL1KEEP, [A08, #A_PREFETCH] - - ldp q0, q1, [A01] - ldp q2, q3, [A02] - add A01, A01, #32 - add A02, A02, #32 - - st1 {v0.4s, v1.4s, v2.4s, 
v3.4s}, [B01] - add B01, B01, #64 - - ldp q4, q5, [A03] - ldp q6, q7, [A04] - add A03, A03, #32 - add A04, A04, #32 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] - add B01, B01, #64 - - ldp q8, q9, [A05] - ldp q10, q11, [A06] - add A05, A05, #32 - add A06, A06, #32 - - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B01] - add B01, B01, #64 - - ldp q12, q13, [A07] - ldp q14, q15, [A08] - add A07, A07, #32 - add A08, A08, #32 - - st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [B01] - add B01, B01, #64 -.endm - -.macro COPY4x8 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - //prfm PLDL1KEEP, [A05, #A_PREFETCH] - //prfm PLDL1KEEP, [A06, #A_PREFETCH] - //prfm PLDL1KEEP, [A07, #A_PREFETCH] - //prfm PLDL1KEEP, [A08, #A_PREFETCH] - - ldr q0, [A01] - ldr q1, [A02] - ldr q2, [A03] - ldr q3, [A04] - add A01, A01, #16 - add A02, A02, #16 - add A03, A03, #16 - add A04, A04, #16 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] - add B02, B02, #64 - - ldr q4, [A05] - ldr q5, [A06] - ldr q6, [A07] - ldr q7, [A08] - - add A05, A05, #16 - add A06, A06, #16 - add A07, A07, #16 - add A08, A08, #16 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B02] - add B02, B02, #64 -.endm - -.macro COPY2x8 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - //prfm PLDL1KEEP, [A05, #A_PREFETCH] - //prfm PLDL1KEEP, [A06, #A_PREFETCH] - //prfm PLDL1KEEP, [A07, #A_PREFETCH] - //prfm PLDL1KEEP, [A08, #A_PREFETCH] - - ldr d0, [A01] - ldr d1, [A02] - ldr d2, [A03] - ldr d3, [A04] - - add A01, A01, #8 - add A02, A02, #8 - add A03, A03, #8 - add A04, A04, #8 - - stp d0, d1, [B03] - add B03, B03, #16 - stp d2, d3, [B03] - add B03, B03, #16 - - ldr d4, [A05] - ldr d5, [A06] - ldr d6, [A07] - ldr d7, [A08] - - add A05, A05, #8 - add A06, A06, #8 - add A07, A07, #8 - add A08, A08, #8 - - stp d4, d5, [B03] - add B03, B03, #16 - stp d6, d7, [B03] - add B03, B03, #16 - -.endm - -.macro COPY1x8 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - //prfm PLDL1KEEP, [A05, #A_PREFETCH] - //prfm PLDL1KEEP, [A06, #A_PREFETCH] - //prfm PLDL1KEEP, [A07, #A_PREFETCH] - //prfm PLDL1KEEP, [A08, #A_PREFETCH] - - ldr s0, [A01] - ldr s1, [A02] - ldr s2, [A03] - ldr s3, [A04] - - stp s0, s1, [B04] - add B04, B04, #8 - stp s2, s3, [B04] - add B04, B04, #8 - - ldr s4, [A05] - ldr s5, [A06] - ldr s6, [A07] - ldr s7, [A08] - - stp s4, s5, [B04] - add B04, B04, #8 - stp s6, s7, [B04] - add B04, B04, #8 - -.endm - -/*************************************************************************************************************************/ -.macro COPY16x4 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] - add A01, A01, #64 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] - add TEMP1, B00, #64 - - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] - add A02, A02, #64 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] - add A03, A03, #64 - - st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] - add TEMP1, TEMP1, #64 - - ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] - add A04, A04, #64 - - st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] - - add B00, B00, M8 -.endm - -.macro COPY8x4 - prfm PLDL1KEEP, [A01, 
#A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - prfm PLDL1KEEP, [A03, #A_PREFETCH] - prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldp q0, q1, [A01] - ldp q2, q3, [A02] - add A01, A01, #32 - add A02, A02, #32 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] - add B01, B01, #64 - - ldp q4, q5, [A03] - ldp q6, q7, [A04] - add A03, A03, #32 - add A04, A04, #32 - - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] - add B01, B01, #64 -.endm - -.macro COPY4x4 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr q0, [A01] - ldr q1, [A02] - ldr q2, [A03] - ldr q3, [A04] - add A01, A01, #16 - add A02, A02, #16 - add A03, A03, #16 - add A04, A04, #16 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] - - add B02, B02, #64 -.endm - -.macro COPY2x4 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr d0, [A01] - ldr d1, [A02] - ldr d2, [A03] - ldr d3, [A04] - - add A01, A01, #8 - add A02, A02, #8 - add A03, A03, #8 - add A04, A04, #8 - - stp d0, d1, [B03] - add B03, B03, #16 - stp d2, d3, [B03] - - add B03, B03, #16 -.endm - -.macro COPY1x4 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - //prfm PLDL1KEEP, [A03, #A_PREFETCH] - //prfm PLDL1KEEP, [A04, #A_PREFETCH] - - ldr s0, [A01] - ldr s1, [A02] - ldr s2, [A03] - ldr s3, [A04] - - add A01, A01, #4 - add A02, A02, #4 - add A03, A03, #4 - add A04, A04, #4 - - stp s0, s1, [B04] - add B04, B04, #8 - stp s2, s3, [B04] - add B04, B04, #8 - -.endm - -/*************************************************************************************************************************/ - -.macro COPY16x2 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] - add A01, A01, #64 - - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] - add A02, A02, #64 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] - add TEMP1, B00, #64 - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] - add B00, B00, M8 -.endm - -.macro COPY8x2 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ld1 {v0.4s, v1.4s}, [A01] - ld1 {v2.4s, v3.4s}, [A02] - add A01, A01, #32 - add A02, A02, #32 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] - add B01, B01, #64 -.endm - -.macro COPY4x2 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr q0, [A01] - ldr q1, [A02] - add A01, A01, #16 - add A02, A02, #16 - - stp q0, q1, [B02] - add B02, B02, #32 -.endm - -.macro COPY2x2 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr d0, [A01] - ldr d1, [A02] - - add A01, A01, #8 - add A02, A02, #8 - - stp d0, d1, [B03] - add B03, B03, #16 -.endm - -.macro COPY1x2 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - //prfm PLDL1KEEP, [A02, #A_PREFETCH] - - ldr s0, [A01] - ldr s1, [A02] - - add A01, A01, #4 - add A02, A02, #4 - - stp s0, s1, [B04] - - add B04, B04, #8 -.endm - -/*************************************************************************************************************************/ - -.macro COPY16x1 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] - add A01, A01, #64 - - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] - add B00, B00, M8 -.endm - -.macro COPY8x1 - prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldp q0, q1, [A01] - add A01, A01, #32 - stp q0, q1, [B01] - - add B01, B01, #32 -.endm - -.macro COPY4x1 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] 
- - ldr q0, [A01] - add A01, A01, #16 - str q0, [B02] - - add B02, B02, #16 -.endm - -.macro COPY2x1 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldr d0, [A01] - add A01, A01, #8 - str d0, [B03] - - add B03, B03, #8 -.endm - -.macro COPY1x1 - //prfm PLDL1KEEP, [A01, #A_PREFETCH] - - ldr s0, [A01] - add A01, A01, #4 - str s0, [B04] - - add B04, B04, #4 -.endm - -/************************************************************************************** -* End of macro definitions -**************************************************************************************/ - - PROLOGUE - - .align 5 - - SAVE_REGS - - lsl LDA, LDA, #2 // LDA = LDA * SIZE - - lsl TEMP1, M, #2 // TEMP1 = M * SIZE - - and B01 , N , #-16 - and B02 , N , #-8 - and B03 , N , #-4 - and B04 , N , #-2 - - mul B01, B01, TEMP1 - mul B02, B02, TEMP1 - mul B03, B03, TEMP1 - mul B04, B04, TEMP1 - - add B01 , B01, B - add B02 , B02, B - add B03 , B03, B - add B04 , B04, B - - lsl M8, M, #6 // M8 = M * 16 * SIZE - -.Lsgemm_tcopy_L8_BEGIN: - asr J, M, #3 // J = M / 8 - cmp J, #0 - ble .Lsgemm_tcopy_L4_BEGIN - - .align 5 -.Lsgemm_tcopy_L8_M16_BEGIN: - - mov A01, A - add A02, A01, LDA - add A03, A02, LDA - add A04, A03, LDA - add A05, A04, LDA - add A06, A05, LDA - add A07, A06, LDA - add A08, A07, LDA - add A, A08, LDA - - mov B00, B - add B, B00, #512 // B = B + 8 * 16 * SIZE - - asr I, N, #4 // I = N / 16 - cmp I, #0 - ble .Lsgemm_tcopy_L8_M16_40 - - .align 5 -.Lsgemm_tcopy_L8_M16_20: - - COPY16x8 - - subs I , I , #1 - bne .Lsgemm_tcopy_L8_M16_20 - -.Lsgemm_tcopy_L8_M16_40: - tst N , #8 - ble .Lsgemm_tcopy_L8_M16_60 - - COPY8x8 - -.Lsgemm_tcopy_L8_M16_60: - tst N , #4 - ble .Lsgemm_tcopy_L8_M16_80 - - COPY4x8 - -.Lsgemm_tcopy_L8_M16_80: - - tst N , #2 - ble .Lsgemm_tcopy_L8_M16_100 - - COPY2x8 - -.Lsgemm_tcopy_L8_M16_100: - - tst N, #1 - ble .Lsgemm_tcopy_L8_M16_END - - COPY1x8 - -.Lsgemm_tcopy_L8_M16_END: - - subs J , J, #1 // j-- - bne .Lsgemm_tcopy_L8_M16_BEGIN - -/*********************************************************************************************/ - -.Lsgemm_tcopy_L4_BEGIN: - tst M, #7 - ble .Lsgemm_tcopy_L999 - - tst M, #4 - ble .Lsgemm_tcopy_L2_BEGIN - -.Lsgemm_tcopy_L4_M16_BEGIN: - - mov A01, A - add A02, A01, LDA - add A03, A02, LDA - add A04, A03, LDA - add A, A04, LDA - - mov B00, B - add B, B00, #256 // B = B + 4 * 16 * SIZE - - asr I, N, #4 // I = N / 16 - cmp I, #0 - ble .Lsgemm_tcopy_L4_M16_40 - - .align 5 -.Lsgemm_tcopy_L4_M16_20: - - COPY16x4 - - subs I , I , #1 - bne .Lsgemm_tcopy_L4_M16_20 - -.Lsgemm_tcopy_L4_M16_40: - tst N , #8 - ble .Lsgemm_tcopy_L4_M16_60 - - COPY8x4 - -.Lsgemm_tcopy_L4_M16_60: - tst N , #4 - ble .Lsgemm_tcopy_L4_M16_80 - - COPY4x4 - -.Lsgemm_tcopy_L4_M16_80: - - tst N , #2 - ble .Lsgemm_tcopy_L4_M16_100 - - COPY2x4 - - -.Lsgemm_tcopy_L4_M16_100: - - tst N, #1 - ble .Lsgemm_tcopy_L4_M16_END - - COPY1x4 - - -.Lsgemm_tcopy_L4_M16_END: - -/*********************************************************************************************/ - -.Lsgemm_tcopy_L2_BEGIN: - - tst M, #3 - ble .Lsgemm_tcopy_L999 - - tst M, #2 - ble .Lsgemm_tcopy_L1_BEGIN - -.Lsgemm_tcopy_L2_M16_BEGIN: - mov A01, A - add A02, A01, LDA - add A, A02, LDA - - mov B00, B - add B, B00, #128 // B = B + 2 * 16 * SIZE - - asr I, N, #4 // I = N / 16 - cmp I, #0 - ble .Lsgemm_tcopy_L2_M16_40 - - .align 5 -.Lsgemm_tcopy_L2_M16_20: - - COPY16x2 - - subs I , I , #1 - bne .Lsgemm_tcopy_L2_M16_20 - -.Lsgemm_tcopy_L2_M16_40: - tst N , #8 - ble .Lsgemm_tcopy_L2_M16_60 - - COPY8x2 - -.Lsgemm_tcopy_L2_M16_60: - tst N , #4 - ble 
.Lsgemm_tcopy_L2_M16_80 - - COPY4x2 - -.Lsgemm_tcopy_L2_M16_80: - - tst N , #2 - ble .Lsgemm_tcopy_L2_M16_100 - - COPY2x2 - -.Lsgemm_tcopy_L2_M16_100: - - tst N , #1 - ble .Lsgemm_tcopy_L2_M16_END - - COPY1x2 - -.Lsgemm_tcopy_L2_M16_END: - -/*********************************************************************************************/ - -.Lsgemm_tcopy_L1_BEGIN: - - tst M, #1 - ble .Lsgemm_tcopy_L999 - - -.Lsgemm_tcopy_L1_M16_BEGIN: - - mov A01, A // A01 = A - mov B00, B - - asr I, N, #4 // I = M / 16 - cmp I, #0 - ble .Lsgemm_tcopy_L1_M16_40 - - .align 5 -.Lsgemm_tcopy_L1_M16_20: - - COPY16x1 - - subs I , I , #1 - bne .Lsgemm_tcopy_L1_M16_20 - -.Lsgemm_tcopy_L1_M16_40: - tst N , #8 - ble .Lsgemm_tcopy_L1_M16_60 - - COPY8x1 - -.Lsgemm_tcopy_L1_M16_60: - tst N , #4 - ble .Lsgemm_tcopy_L1_M16_80 - - COPY4x1 - -.Lsgemm_tcopy_L1_M16_80: - - tst N , #2 - ble .Lsgemm_tcopy_L1_M16_100 - - COPY2x1 - -.Lsgemm_tcopy_L1_M16_100: - - tst N , #1 - ble .Lsgemm_tcopy_L1_M16_END - - COPY1x1 - - -.Lsgemm_tcopy_L1_M16_END: - -.Lsgemm_tcopy_L999: - mov x0, #0 // set return value - RESTORE_REGS - ret - - EPILOGUE - - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define A x2 +#define LDA x3 +#define B x4 + +#define M8 x5 + +#define A01 x6 +#define A02 x7 +#define A03 x8 +#define A04 x9 +#define A05 x10 +#define A06 x11 +#define A07 x12 +#define A08 x13 + +#define B01 x14 +#define B02 x15 +#define B03 x16 +#define B04 x17 +#define B00 x22 + + +#define I x21 +#define J x19 + +#define TEMP1 x20 + +#define A_PREFETCH 256 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +/*************************************************************************************************************************/ + +.macro COPY16x8 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + prfm PLDL1KEEP, [A05, #A_PREFETCH] + prfm PLDL1KEEP, [A06, #A_PREFETCH] + prfm PLDL1KEEP, [A07, #A_PREFETCH] + prfm PLDL1KEEP, [A08, #A_PREFETCH] + //prfm PSTL1KEEP, [B00, M8] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] + add A02, A02, #64 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] + 
add A03, A03, #64 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] + add A04, A04, #64 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [A05] + add A05, A05, #64 + + st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [A06] + add A06, A06, #64 + + st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [A07] + add A07, A07, #64 + + st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [A08] + add A08, A08, #64 + + st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + add B00, B00, M8 + +.endm + +.macro COPY8x8 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + prfm PLDL1KEEP, [A05, #A_PREFETCH] + prfm PLDL1KEEP, [A06, #A_PREFETCH] + prfm PLDL1KEEP, [A07, #A_PREFETCH] + prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + add B01, B01, #64 + + ldp q4, q5, [A03] + ldp q6, q7, [A04] + add A03, A03, #32 + add A04, A04, #32 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] + add B01, B01, #64 + + ldp q8, q9, [A05] + ldp q10, q11, [A06] + add A05, A05, #32 + add A06, A06, #32 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B01] + add B01, B01, #64 + + ldp q12, q13, [A07] + ldp q14, q15, [A08] + add A07, A07, #32 + add A08, A08, #32 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY4x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + ldr q2, [A03] + ldr q3, [A04] + add A01, A01, #16 + add A02, A02, #16 + add A03, A03, #16 + add A04, A04, #16 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] + add B02, B02, #64 + + ldr q4, [A05] + ldr q5, [A06] + ldr q6, [A07] + ldr q7, [A08] + + add A05, A05, #16 + add A06, A06, #16 + add A07, A07, #16 + add A08, A08, #16 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B02] + add B02, B02, #64 +.endm + +.macro COPY2x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + //prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + ldr d2, [A03] + ldr d3, [A04] + + add A01, A01, #8 + add A02, A02, #8 + add A03, A03, #8 + add A04, A04, #8 + + stp d0, d1, [B03] + add B03, B03, #16 + stp d2, d3, [B03] + add B03, B03, #16 + + ldr d4, [A05] + ldr d5, [A06] + ldr d6, [A07] + ldr d7, [A08] + + add A05, A05, #8 + add A06, A06, #8 + add A07, A07, #8 + add A08, A08, #8 + + stp d4, d5, [B03] + add B03, B03, #16 + stp d6, d7, [B03] + add B03, B03, #16 + +.endm + +.macro COPY1x8 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + //prfm PLDL1KEEP, [A05, #A_PREFETCH] + //prfm PLDL1KEEP, [A06, #A_PREFETCH] + 
//prfm PLDL1KEEP, [A07, #A_PREFETCH] + //prfm PLDL1KEEP, [A08, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + ldr s2, [A03] + ldr s3, [A04] + + stp s0, s1, [B04] + add B04, B04, #8 + stp s2, s3, [B04] + add B04, B04, #8 + + ldr s4, [A05] + ldr s5, [A06] + ldr s6, [A07] + ldr s7, [A08] + + stp s4, s5, [B04] + add B04, B04, #8 + stp s6, s7, [B04] + add B04, B04, #8 + +.endm + +/*************************************************************************************************************************/ +.macro COPY16x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] + add A02, A02, #64 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03] + add A03, A03, #64 + + st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] + add TEMP1, TEMP1, #64 + + ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04] + add A04, A04, #64 + + st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] + + add B00, B00, M8 +.endm + +.macro COPY8x4 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + prfm PLDL1KEEP, [A03, #A_PREFETCH] + prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + add A01, A01, #32 + add A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + add B01, B01, #64 + + ldp q4, q5, [A03] + ldp q6, q7, [A04] + add A03, A03, #32 + add A04, A04, #32 + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY4x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + ldr q2, [A03] + ldr q3, [A04] + add A01, A01, #16 + add A02, A02, #16 + add A03, A03, #16 + add A04, A04, #16 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B02] + + add B02, B02, #64 +.endm + +.macro COPY2x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + ldr d2, [A03] + ldr d3, [A04] + + add A01, A01, #8 + add A02, A02, #8 + add A03, A03, #8 + add A04, A04, #8 + + stp d0, d1, [B03] + add B03, B03, #16 + stp d2, d3, [B03] + + add B03, B03, #16 +.endm + +.macro COPY1x4 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + //prfm PLDL1KEEP, [A03, #A_PREFETCH] + //prfm PLDL1KEEP, [A04, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + ldr s2, [A03] + ldr s3, [A04] + + add A01, A01, #4 + add A02, A02, #4 + add A03, A03, #4 + add A04, A04, #4 + + stp s0, s1, [B04] + add B04, B04, #8 + stp s2, s3, [B04] + add B04, B04, #8 + +.endm + +/*************************************************************************************************************************/ + +.macro COPY16x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02] + add A02, A02, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add TEMP1, B00, #64 + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] + add B00, B00, M8 +.endm + +.macro COPY8x2 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ld1 {v0.4s, v1.4s}, [A01] + ld1 {v2.4s, v3.4s}, [A02] + add A01, A01, #32 + add 
A02, A02, #32 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] + add B01, B01, #64 +.endm + +.macro COPY4x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr q0, [A01] + ldr q1, [A02] + add A01, A01, #16 + add A02, A02, #16 + + stp q0, q1, [B02] + add B02, B02, #32 +.endm + +.macro COPY2x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr d0, [A01] + ldr d1, [A02] + + add A01, A01, #8 + add A02, A02, #8 + + stp d0, d1, [B03] + add B03, B03, #16 +.endm + +.macro COPY1x2 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + //prfm PLDL1KEEP, [A02, #A_PREFETCH] + + ldr s0, [A01] + ldr s1, [A02] + + add A01, A01, #4 + add A02, A02, #4 + + stp s0, s1, [B04] + + add B04, B04, #8 +.endm + +/*************************************************************************************************************************/ + +.macro COPY16x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01] + add A01, A01, #64 + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] + add B00, B00, M8 +.endm + +.macro COPY8x1 + prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldp q0, q1, [A01] + add A01, A01, #32 + stp q0, q1, [B01] + + add B01, B01, #32 +.endm + +.macro COPY4x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr q0, [A01] + add A01, A01, #16 + str q0, [B02] + + add B02, B02, #16 +.endm + +.macro COPY2x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr d0, [A01] + add A01, A01, #8 + str d0, [B03] + + add B03, B03, #8 +.endm + +.macro COPY1x1 + //prfm PLDL1KEEP, [A01, #A_PREFETCH] + + ldr s0, [A01] + add A01, A01, #4 + str s0, [B04] + + add B04, B04, #4 +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + SAVE_REGS + + lsl LDA, LDA, #2 // LDA = LDA * SIZE + + lsl TEMP1, M, #2 // TEMP1 = M * SIZE + + and B01 , N , #-16 + and B02 , N , #-8 + and B03 , N , #-4 + and B04 , N , #-2 + + mul B01, B01, TEMP1 + mul B02, B02, TEMP1 + mul B03, B03, TEMP1 + mul B04, B04, TEMP1 + + add B01 , B01, B + add B02 , B02, B + add B03 , B03, B + add B04 , B04, B + + lsl M8, M, #6 // M8 = M * 16 * SIZE + +.Lsgemm_tcopy_L8_BEGIN: + asr J, M, #3 // J = M / 8 + cmp J, #0 + ble .Lsgemm_tcopy_L4_BEGIN + + .align 5 +.Lsgemm_tcopy_L8_M16_BEGIN: + + mov A01, A + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A05, A04, LDA + add A06, A05, LDA + add A07, A06, LDA + add A08, A07, LDA + add A, A08, LDA + + mov B00, B + add B, B00, #512 // B = B + 8 * 16 * SIZE + + asr I, N, #4 // I = N / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L8_M16_40 + + .align 5 +.Lsgemm_tcopy_L8_M16_20: + + COPY16x8 + + subs I , I , #1 + bne .Lsgemm_tcopy_L8_M16_20 + +.Lsgemm_tcopy_L8_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L8_M16_60 + + COPY8x8 + +.Lsgemm_tcopy_L8_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L8_M16_80 + + COPY4x8 + +.Lsgemm_tcopy_L8_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L8_M16_100 + + COPY2x8 + +.Lsgemm_tcopy_L8_M16_100: + + tst N, #1 + ble .Lsgemm_tcopy_L8_M16_END + + COPY1x8 + +.Lsgemm_tcopy_L8_M16_END: + + subs J , J, #1 // j-- + bne .Lsgemm_tcopy_L8_M16_BEGIN + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L4_BEGIN: + tst M, #7 + ble .Lsgemm_tcopy_L999 + + tst M, #4 + ble .Lsgemm_tcopy_L2_BEGIN + +.Lsgemm_tcopy_L4_M16_BEGIN: + + mov A01, A + add A02, A01, LDA + add A03, A02, LDA + add A04, A03, LDA + add A, A04, LDA + + mov 
B00, B + add B, B00, #256 // B = B + 4 * 16 * SIZE + + asr I, N, #4 // I = N / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L4_M16_40 + + .align 5 +.Lsgemm_tcopy_L4_M16_20: + + COPY16x4 + + subs I , I , #1 + bne .Lsgemm_tcopy_L4_M16_20 + +.Lsgemm_tcopy_L4_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L4_M16_60 + + COPY8x4 + +.Lsgemm_tcopy_L4_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L4_M16_80 + + COPY4x4 + +.Lsgemm_tcopy_L4_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L4_M16_100 + + COPY2x4 + + +.Lsgemm_tcopy_L4_M16_100: + + tst N, #1 + ble .Lsgemm_tcopy_L4_M16_END + + COPY1x4 + + +.Lsgemm_tcopy_L4_M16_END: + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L2_BEGIN: + + tst M, #3 + ble .Lsgemm_tcopy_L999 + + tst M, #2 + ble .Lsgemm_tcopy_L1_BEGIN + +.Lsgemm_tcopy_L2_M16_BEGIN: + mov A01, A + add A02, A01, LDA + add A, A02, LDA + + mov B00, B + add B, B00, #128 // B = B + 2 * 16 * SIZE + + asr I, N, #4 // I = N / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L2_M16_40 + + .align 5 +.Lsgemm_tcopy_L2_M16_20: + + COPY16x2 + + subs I , I , #1 + bne .Lsgemm_tcopy_L2_M16_20 + +.Lsgemm_tcopy_L2_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L2_M16_60 + + COPY8x2 + +.Lsgemm_tcopy_L2_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L2_M16_80 + + COPY4x2 + +.Lsgemm_tcopy_L2_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L2_M16_100 + + COPY2x2 + +.Lsgemm_tcopy_L2_M16_100: + + tst N , #1 + ble .Lsgemm_tcopy_L2_M16_END + + COPY1x2 + +.Lsgemm_tcopy_L2_M16_END: + +/*********************************************************************************************/ + +.Lsgemm_tcopy_L1_BEGIN: + + tst M, #1 + ble .Lsgemm_tcopy_L999 + + +.Lsgemm_tcopy_L1_M16_BEGIN: + + mov A01, A // A01 = A + mov B00, B + + asr I, N, #4 // I = M / 16 + cmp I, #0 + ble .Lsgemm_tcopy_L1_M16_40 + + .align 5 +.Lsgemm_tcopy_L1_M16_20: + + COPY16x1 + + subs I , I , #1 + bne .Lsgemm_tcopy_L1_M16_20 + +.Lsgemm_tcopy_L1_M16_40: + tst N , #8 + ble .Lsgemm_tcopy_L1_M16_60 + + COPY8x1 + +.Lsgemm_tcopy_L1_M16_60: + tst N , #4 + ble .Lsgemm_tcopy_L1_M16_80 + + COPY4x1 + +.Lsgemm_tcopy_L1_M16_80: + + tst N , #2 + ble .Lsgemm_tcopy_L1_M16_100 + + COPY2x1 + +.Lsgemm_tcopy_L1_M16_100: + + tst N , #1 + ble .Lsgemm_tcopy_L1_M16_END + + COPY1x1 + + +.Lsgemm_tcopy_L1_M16_END: + +.Lsgemm_tcopy_L999: + mov x0, #0 // set return value + RESTORE_REGS + ret + + EPILOGUE + + diff --git a/kernel/power/cgemm_kernel_power9.S b/kernel/power/cgemm_kernel_power9.S index 4b5c2fa31..dfe17f3ef 100644 --- a/kernel/power/cgemm_kernel_power9.S +++ b/kernel/power/cgemm_kernel_power9.S @@ -1,293 +1,293 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@gmail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - - -#define LOAD ld -#define STACKSIZE (512 ) -#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ -#define M r3 -#define N r4 -#define K r5 - - -#define A r8 -#define B r9 -#define C r10 -#define LDC r6 -#define OFFSET r7 - - -#define alpha_r vs19 -#define alpha_i vs20 -#define save_permute_1 vs21 -#define permute_mask vs22 -#define o0 0 - - -#define T1 r11 -#define T2 r12 -#define T3 r14 -#define T4 r15 -#define T5 r16 -#define T6 r17 -#define L r18 -#define T7 r19 -#define T8 r20 -#define TEMP_REG r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define T9 r27 -#define T10 r28 -#define PRE r29 - -#define T12 r30 -#define T13 r31 - -#include "cgemm_macros_power9.S" - -.equ perm_const1, 0x0405060700010203 -.equ perm_const2, 0x0c0d0e0f08090a0b -.equ save_permute_12, 0x0c0d0e0f1c1d1e1f -.equ save_permute_11, 0x0405060714151617 - - - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - - addi SP, SP, -STACKSIZE - mflr r0 - - - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - std r0, FLINK_SAVE(SP) - - - - ld LDC, FRAMESLOT(0) + STACKSIZE(SP) - - - -#ifdef TRMMKERNEL - ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) -#endif - slwi LDC, LDC, ZBASE_SHIFT - - - - /*alpha is stored in f1. 
convert to single and splat*/ - xscvdpspn alpha_r,vs1 - xscvdpspn alpha_i,vs2 - xxspltw alpha_r,alpha_r,0 - xxspltw alpha_i,alpha_i,0 -/*load reverse permute mask for big endian - uint128 = 0xc0d0e0f08090a0b0405060700010203 -*/ - - lis T2, perm_const2@highest - lis T1, perm_const1@highest - lis T3, save_permute_12@highest - lis T4, save_permute_11@highest - - - ori T2, T2, perm_const2@higher - ori T1, T1, perm_const1@higher - ori T3, T3, save_permute_12@higher - ori T4, T4, save_permute_11@higher - - - rldicr T2, T2, 32, 31 - rldicr T1, T1, 32, 31 - rldicr T3, T3, 32, 31 - rldicr T4, T4, 32, 31 - - oris T2, T2, perm_const2@h - oris T1, T1, perm_const1@h - oris T3, T3, save_permute_12@h - oris T4, T4, save_permute_11@h - - - ori T2, T2, perm_const2@l - ori T1, T1, perm_const1@l - ori T3, T3, save_permute_12@l - ori T4, T4, save_permute_11@l - - - li r0,0 - li PRE,512 - -#if defined(CC) || defined(CR) || defined(RC) || defined(RR) -/*negate for this case as we will use addition -1*(a+b) */ - xvnegsp alpha_r,alpha_r - xvnegsp alpha_i,alpha_i -#endif - - mtvsrdd permute_mask,T2,T1 - mtvsrdd save_permute_1,T3,T4 - - /*mask is reverse permute so we have to make it inner permute */ - xxpermdi permute_mask, permute_mask, permute_mask,2 - -#include "cgemm_logic_power9.S" - -.L999: - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - ld r0, FLINK_SAVE(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - mtlr r0 - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - - EPILOGUE -#endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@gmail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + +#define alpha_r vs19 +#define alpha_i vs20 +#define save_permute_1 vs21 +#define permute_mask vs22 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define PRE r29 + +#define T12 r30 +#define T13 r31 + +#include "cgemm_macros_power9.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_12, 0x0c0d0e0f1c1d1e1f +.equ save_permute_11, 0x0405060714151617 + + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + + addi SP, SP, -STACKSIZE + mflr r0 + + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) + + + + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) + + + +#ifdef TRMMKERNEL + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + slwi LDC, LDC, ZBASE_SHIFT + + + + /*alpha is stored in f1. 
convert to single and splat*/ + xscvdpspn alpha_r,vs1 + xscvdpspn alpha_i,vs2 + xxspltw alpha_r,alpha_r,0 + xxspltw alpha_i,alpha_i,0 +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + lis T2, perm_const2@highest + lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + + + ori T2, T2, perm_const2@higher + ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + + + rldicr T2, T2, 32, 31 + rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + + oris T2, T2, perm_const2@h + oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + + + ori T2, T2, perm_const2@l + ori T1, T1, perm_const1@l + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + + + li r0,0 + li PRE,512 + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegsp alpha_r,alpha_r + xvnegsp alpha_i,alpha_i +#endif + + mtvsrdd permute_mask,T2,T1 + mtvsrdd save_permute_1,T3,T4 + + /*mask is reverse permute so we have to make it inner permute */ + xxpermdi permute_mask, permute_mask, permute_mask,2 + +#include "cgemm_logic_power9.S" + +.L999: + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + + EPILOGUE +#endif diff --git a/kernel/power/cgemm_logic_power9.S b/kernel/power/cgemm_logic_power9.S index b4f937e90..a191219fa 100644 --- a/kernel/power/cgemm_logic_power9.S +++ b/kernel/power/cgemm_logic_power9.S @@ -1,2816 +1,2816 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@gmail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ -#define MY_ALIGN .align 3 -b CGEMM_L4 -/* MINI SUBROUTINES */ -/* 4x8 MAIN 128x+2 LOOP */ - - -CGEMM_L4x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x8_2 - MY_ALIGN -CGEMM_L4x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL4x8_L2 128,64,0,0 -CGEMM_L4x8_K128: -/*----------------------------------------*/ - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_L2 128,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL4x8_L2 128,64,8,0 - KERNEL4x8_L2 128,64,9,0 - KERNEL4x8_L2 128,64,10,0 - KERNEL4x8_L2 128,64,11,0 - dcbt BO, T4 - KERNEL4x8_L2 128,64,12,0 - KERNEL4x8_L2 128,64,13,0 - KERNEL4x8_L2 128,64,14,0 - KERNEL4x8_L2 128,64,15,0 - KERNEL4x8_L2 128,64,16,0 - KERNEL4x8_L2 128,64,17,0 - KERNEL4x8_L2 128,64,18,0 - KERNEL4x8_L2 128,64,19,0 - KERNEL4x8_L2 128,64,20,0 - KERNEL4x8_L2 128,64,21,0 - KERNEL4x8_L2 128,64,22,0 - KERNEL4x8_L2 128,64,23,0 - KERNEL4x8_L2 128,64,24,0 - KERNEL4x8_L2 128,64,25,0 - KERNEL4x8_L2 128,64,26,0 - KERNEL4x8_L2 128,64,27,0 - KERNEL4x8_L2 128,64,28,0 - KERNEL4x8_L2 128,64,29,0 - KERNEL4x8_L2 128,64,30,0 - KERNEL4x8_L2 128,64,31,0 - KERNEL4x8_L2 128,64,32,0 - KERNEL4x8_L2 128,64,33,0 - KERNEL4x8_L2 128,64,34,0 - KERNEL4x8_L2 128,64,35,0 - KERNEL4x8_L2 128,64,36,0 - KERNEL4x8_L2 128,64,37,0 - KERNEL4x8_L2 128,64,38,0 - KERNEL4x8_L2 128,64,39,0 - KERNEL4x8_L2 128,64,40,0 - KERNEL4x8_L2 128,64,41,0 - KERNEL4x8_L2 128,64,42,0 - KERNEL4x8_L2 128,64,43,0 - KERNEL4x8_L2 128,64,44,0 - KERNEL4x8_L2 128,64,45,0 - KERNEL4x8_L2 128,64,46,0 - KERNEL4x8_L2 128,64,47,0 - KERNEL4x8_L2 128,64,48,0 - KERNEL4x8_L2 128,64,49,0 - KERNEL4x8_L2 128,64,50,0 - KERNEL4x8_L2 128,64,51,0 - KERNEL4x8_L2 128,64,52,0 - KERNEL4x8_L2 128,64,53,0 - KERNEL4x8_L2 128,64,54,0 - KERNEL4x8_L2 128,64,55,0 - KERNEL4x8_L2 128,64,56,0 - KERNEL4x8_L2 128,64,57,0 - KERNEL4x8_L2 128,64,58,0 - KERNEL4x8_L2 128,64,59,0 - KERNEL4x8_L2 128,64,60,0 - KERNEL4x8_L2 128,64,61,0 - KERNEL4x8_L2 128,64,62,0 - KERNEL4x8_L2 128,64,63,1 - bdnz CGEMM_L4x8_LOOP - MY_ALIGN -CGEMM_L4x8_LOOP_END: -/*----------------------------------------*/ - END4x8_2 - blr - MY_ALIGN - - -CGEMM_4x8_L64_SUB: -/*----------------------------------------*/ - LOAD4x8_2 - dcbt AO, PRE - dcbt BO, PRE - 
KERNEL4x8_L2 128,64,0,0 - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_L2 128,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL4x8_L2 128,64,8,0 - KERNEL4x8_L2 128,64,9,0 - KERNEL4x8_L2 128,64,10,0 - KERNEL4x8_L2 128,64,11,0 - dcbt BO, T4 - KERNEL4x8_L2 128,64,12,0 - KERNEL4x8_L2 128,64,13,0 - KERNEL4x8_L2 128,64,14,0 - KERNEL4x8_L2 128,64,15,0 - KERNEL4x8_L2 128,64,16,0 - KERNEL4x8_L2 128,64,17,0 - KERNEL4x8_L2 128,64,18,0 - KERNEL4x8_L2 128,64,19,0 - KERNEL4x8_L2 128,64,20,0 - KERNEL4x8_L2 128,64,21,0 - KERNEL4x8_L2 128,64,22,0 - KERNEL4x8_L2 128,64,23,0 - KERNEL4x8_L2 128,64,24,0 - KERNEL4x8_L2 128,64,25,0 - KERNEL4x8_L2 128,64,26,0 - KERNEL4x8_L2 128,64,27,0 - KERNEL4x8_L2 128,64,28,0 - KERNEL4x8_L2 128,64,29,0 - KERNEL4x8_L2 128,64,30,0 - KERNEL4x8_E2 128,64,31,1 - blr - MY_ALIGN - - -CGEMM_4x8_L32_SUB: -/*----------------------------------------*/ - LOAD4x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL4x8_L2 128,64,0,0 - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_L2 128,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL4x8_L2 128,64,8,0 - KERNEL4x8_L2 128,64,9,0 - KERNEL4x8_L2 128,64,10,0 - KERNEL4x8_L2 128,64,11,0 - dcbt BO, T4 - KERNEL4x8_L2 128,64,12,0 - KERNEL4x8_L2 128,64,13,0 - KERNEL4x8_L2 128,64,14,0 - KERNEL4x8_E2 128,64,15,1 - blr - MY_ALIGN - - -CGEMM_4x8_L16_SUB: -/*----------------------------------------*/ - LOAD4x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL4x8_L2 128,64,0,0 - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_E2 128,64,7,1 - blr - MY_ALIGN - - -CGEMM_4x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x4_2 - MY_ALIGN -CGEMM_L4x4_LOOP: -/*----------------------------------------*/ - KERNEL4x4_L2 64,64,0,0 -CGEMM_L4x4_K32: -/*----------------------------------------*/ - KERNEL4x4_L2 64,64,1,0 - KERNEL4x4_L2 64,64,2,0 - KERNEL4x4_L2 64,64,3,0 - KERNEL4x4_L2 64,64,4,0 - KERNEL4x4_L2 64,64,5,0 - KERNEL4x4_L2 64,64,6,0 - KERNEL4x4_L2 64,64,7,0 - KERNEL4x4_L2 64,64,8,0 - KERNEL4x4_L2 64,64,9,0 - KERNEL4x4_L2 64,64,10,0 - KERNEL4x4_L2 64,64,11,0 - KERNEL4x4_L2 64,64,12,0 - KERNEL4x4_L2 64,64,13,0 - KERNEL4x4_L2 64,64,14,0 - KERNEL4x4_L2 64,64,15,1 - bdnz CGEMM_L4x4_LOOP - MY_ALIGN -CGEMM_L4x4_LOOP_END: -/*----------------------------------------*/ - END4x4_2 - blr - MY_ALIGN - - -CGEMM_4x4_L16_SUB: -/*----------------------------------------*/ - LOAD4x4_2 - KERNEL4x4_L2 64,64,0,0 - KERNEL4x4_L2 64,64,1,0 - KERNEL4x4_L2 64,64,2,0 - KERNEL4x4_L2 64,64,3,0 - KERNEL4x4_L2 64,64,4,0 - KERNEL4x4_L2 64,64,5,0 - KERNEL4x4_L2 64,64,6,0 - KERNEL4x4_E2 64,64,7,1 - blr - MY_ALIGN - - -CGEMM_4x4_L8_SUB: -/*----------------------------------------*/ - LOAD4x4_2 - KERNEL4x4_L2 64,64,0,0 - KERNEL4x4_L2 64,64,1,0 - KERNEL4x4_L2 64,64,2,0 - KERNEL4x4_E2 64,64,3,1 - blr - - -CGEMM_4x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x2_2 - MY_ALIGN -CGEMM_L4x2_LOOP: -/*----------------------------------------*/ - KERNEL4x2_L2 32,64,0,0 -CGEMM_L4x2_K32: -/*----------------------------------------*/ - KERNEL4x2_L2 32,64,1,0 - 
KERNEL4x2_L2 32,64,2,0 - KERNEL4x2_L2 32,64,3,0 - KERNEL4x2_L2 32,64,4,0 - KERNEL4x2_L2 32,64,5,0 - KERNEL4x2_L2 32,64,6,0 - KERNEL4x2_L2 32,64,7,0 - KERNEL4x2_L2 32,64,8,0 - KERNEL4x2_L2 32,64,9,0 - KERNEL4x2_L2 32,64,10,0 - KERNEL4x2_L2 32,64,11,0 - KERNEL4x2_L2 32,64,12,0 - KERNEL4x2_L2 32,64,13,0 - KERNEL4x2_L2 32,64,14,0 - KERNEL4x2_L2 32,64,15,1 - bdnz CGEMM_L4x2_LOOP - MY_ALIGN - - -CGEMM_L4x2_LOOP_END: -/*----------------------------------------*/ - END4x2_2 - blr - MY_ALIGN -CGEMM_4x2_L16_SUB: -/*----------------------------------------*/ - LOAD4x2_2 - KERNEL4x2_L2 32,64,0,0 - KERNEL4x2_L2 32,64,1,0 - KERNEL4x2_L2 32,64,2,0 - KERNEL4x2_L2 32,64,3,0 - KERNEL4x2_L2 32,64,4,0 - KERNEL4x2_L2 32,64,5,0 - KERNEL4x2_L2 32,64,6,0 - KERNEL4x2_E2 32,64,7,1 - blr - MY_ALIGN -CGEMM_4x2_L8_SUB: -/*----------------------------------------*/ - LOAD4x2_2 - KERNEL4x2_L2 32,64,0,0 - KERNEL4x2_L2 32,64,1,0 - KERNEL4x2_L2 32,64,2,0 - KERNEL4x2_E2 32,64,3,1 - blr - - -CGEMM_4x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x1_2 - MY_ALIGN -CGEMM_L4x1_LOOP: -/*----------------------------------------*/ - KERNEL4x1_L2 16,64,0,0 -CGEMM_L4x1_K32: -/*----------------------------------------*/ - KERNEL4x1_L2 16,64,1,0 - KERNEL4x1_L2 16,64,2,0 - KERNEL4x1_L2 16,64,3,0 - KERNEL4x1_L2 16,64,4,0 - KERNEL4x1_L2 16,64,5,0 - KERNEL4x1_L2 16,64,6,0 - KERNEL4x1_L2 16,64,7,0 - KERNEL4x1_L2 16,64,8,0 - KERNEL4x1_L2 16,64,9,0 - KERNEL4x1_L2 16,64,10,0 - KERNEL4x1_L2 16,64,11,0 - KERNEL4x1_L2 16,64,12,0 - KERNEL4x1_L2 16,64,13,0 - KERNEL4x1_L2 16,64,14,0 - KERNEL4x1_L2 16,64,15,1 - bdnz CGEMM_L4x1_LOOP - MY_ALIGN -CGEMM_L4x1_LOOP_END: -/*----------------------------------------*/ - END4x1_2 - blr - - MY_ALIGN -CGEMM_4x1_L16_SUB: -/*----------------------------------------*/ - LOAD4x1_2 - KERNEL4x1_L2 16,64,0,0 - KERNEL4x1_L2 16,64,1,0 - KERNEL4x1_L2 16,64,2,0 - KERNEL4x1_L2 16,64,3,0 - KERNEL4x1_L2 16,64,4,0 - KERNEL4x1_L2 16,64,5,0 - KERNEL4x1_L2 16,64,6,0 - KERNEL4x1_E2 16,64,7,1 - blr - MY_ALIGN - - -CGEMM_4x1_L8_SUB: -/*----------------------------------------*/ - LOAD4x1_2 - KERNEL4x1_L2 16,64,0,0 - KERNEL4x1_L2 16,64,1,0 - KERNEL4x1_L2 16,64,2,0 - KERNEL4x1_E2 16,64,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -CGEMM_L4: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - srawi. J, N, 2 - ble CGEMM_L4_END - - -CGEMM_L4_BEGIN: -/*----------------------------------------*/ - mr CO, C - slwi T1, LDC , 2 - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble CGEMM_L4x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -CGEMM_L4x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,4 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T1-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO4x8 - ble CGEMM_L4x8_SUB0 - bl CGEMM_L4x8_LMAIN_SUB - andi. 
L, T1, 127 - ble CGEMM_L4x8_SAVE - b CGEMM_L4x8_SUB2 - - -CGEMM_L4x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP4x8_128K - addi BO,BO,-32 - addi AO,AO,-64 - LOAD4x8O 64,32 - END4x8_WITHOUT_ADD - LOAD4x8_2O 128, 64 - mtctr T8 - bl CGEMM_L4x8_K128 - b CGEMM_L4x8_SAVE - CMP4x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne CGEMM_L4x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-128 - LOAD4x8_2O 128,64 - bl CGEMM_L4x8_K128 - b CGEMM_L4x8_SAVE - MY_ALIGN - - -CGEMM_L4x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble CGEMM_L4x8_SUB2_32 - bl CGEMM_4x8_L64_SUB - MY_ALIGN - - -CGEMM_L4x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble CGEMM_L4x8_SUB2_16 - bl CGEMM_4x8_L32_SUB - MY_ALIGN - - -CGEMM_L4x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L4x8_SUB2_8 - bl CGEMM_4x8_L16_SUB - MY_ALIGN - - -CGEMM_L4x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x8_SUB2_4 - LOAD4x8_2 - KERNEL4x8_L2 128,64, 0,0 - KERNEL4x8_L2 128,64, 1,0 - KERNEL4x8_L2 128,64, 2,0 - KERNEL4x8_E2 128,64, 3,1 - MY_ALIGN - - -CGEMM_L4x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x8_SUB2_2 - LOAD4x8_2 - KERNEL4x8_L2 128,64, 0,0 - KERNEL4x8_E2 128,64, 1,1 - MY_ALIGN - - -CGEMM_L4x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x8_SUB2_1 - LOAD4x8_2 - KERNEL4x8_E2 128,64, 0,1 - MY_ALIGN - - -CGEMM_L4x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x8_SAVE - KERNEL4x8 - - MY_ALIGN -CGEMM_L4x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - MY_ALIGN - SAVE4x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4 -#endif - bgt CGEMM_L4x8_BEGIN - andi. T2, M, 7 - ble CGEMM_L4x1_END - andi. T1, M, 4 - ble CGEMM_L4x4_END - b CGEMM_L4x4_BEGIN - MY_ALIGN - - -CGEMM_L4x8_END: -/*----------------------------------------*/ - - -CGEMM_L4x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble CGEMM_L4x1_END - andi. T1, M, 4 - ble CGEMM_L4x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,4 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO4x4 - ble CGEMM_L4x4_SUB0 - bl CGEMM_4x4_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L4x4_SAVE - b CGEMM_L4x4_SUB2 - - -CGEMM_L4x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP4x4_32K - addi BO,BO,-32 - addi AO,AO,-32 - LOAD4x4O 32,32 - END4x4_WITHOUT_ADD - LOAD4x4_2O 64, 64 - mtctr T8 - bl CGEMM_L4x4_K32 - b CGEMM_L4x4_SAVE - CMP4x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L4x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-64 - LOAD4x4_2O 64,64 - bl CGEMM_L4x4_K32 - b CGEMM_L4x4_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L4x4_SUB2: -/*----------------------------------------*/ - andi. 
T1,L, 16 - ble CGEMM_L4x4_SUB2_8 - bl CGEMM_4x4_L16_SUB - MY_ALIGN - - -CGEMM_L4x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x4_SUB2_4 - bl CGEMM_4x4_L8_SUB - MY_ALIGN - - -CGEMM_L4x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x4_SUB2_2 - LOAD4x4_2 - KERNEL4x4_L2 64,64, 0,0 - KERNEL4x4_E2 64,64, 1,1 - MY_ALIGN - - -CGEMM_L4x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x4_SUB2_1 - LOAD4x4_2 - KERNEL4x4_E2 64,64, 0,1 - MY_ALIGN - - -CGEMM_L4x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x4_SAVE - KERNEL4x4 - - -CGEMM_L4x4_SAVE: -/*----------------------------------------*/ - SAVE4x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4 -#endif - - -CGEMM_L4x4_END: -/*----------------------------------------*/ - - -CGEMM_L4x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble CGEMM_L4x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,4 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO4x2 - ble CGEMM_L4x2_SUB0 - bl CGEMM_4x2_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L4x2_SAVE - b CGEMM_L4x2_SUB2 - - -CGEMM_L4x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP4x2_32K - addi BO,BO,-32 - addi AO,AO,-16 - LOAD4x2O 16,32 - END4x2_WITHOUT_ADD - LOAD4x2_2O 32, 64 - mtctr T8 - bl CGEMM_L4x2_K32 - b CGEMM_L4x2_SAVE - CMP4x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L4x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-32 - LOAD4x2_2O 32,64 - bl CGEMM_L4x2_K32 - b CGEMM_L4x2_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L4x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L4x2_SUB2_8 - bl CGEMM_4x2_L16_SUB - MY_ALIGN - - -CGEMM_L4x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x2_SUB2_4 - bl CGEMM_4x2_L8_SUB - MY_ALIGN - - -CGEMM_L4x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x2_SUB2_2 - LOAD4x2_2 - KERNEL4x2_L2 32,64, 0,0 - KERNEL4x2_E2 32,64, 1,1 - MY_ALIGN - - -CGEMM_L4x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x2_SUB2_1 - LOAD4x2_2 - KERNEL4x2_E2 32,64, 0,1 - MY_ALIGN - - -CGEMM_L4x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x2_SAVE - KERNEL4x2 - - MY_ALIGN -CGEMM_L4x2_SAVE: -/*----------------------------------------*/ - SAVE4x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4 -#endif - - -CGEMM_L4x2_END: -/*----------------------------------------*/ - - -CGEMM_L4x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble CGEMM_L4x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,4 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO4x1 - ble CGEMM_L4x1_SUB0 - bl CGEMM_4x1_LMAIN_SUB - andi. 
L, T1, 31 - ble CGEMM_L4x1_SAVE - b CGEMM_L4x1_SUB2 - - -CGEMM_L4x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP4x1_32K - addi BO,BO,-32 - addi AO,AO,-8 - LOAD4x1O 8,32 - END4x1_WITHOUT_ADD - LOAD4x1_2O 16, 64 - mtctr T8 - bl CGEMM_L4x1_K32 - b CGEMM_L4x1_SAVE - CMP4x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L4x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-16 - LOAD4x1_2O 16,64 - bl CGEMM_L4x1_K32 - b CGEMM_L4x1_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L4x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L4x1_SUB2_8 - bl CGEMM_4x1_L16_SUB - MY_ALIGN - - -CGEMM_L4x1_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x1_SUB2_4 - bl CGEMM_4x1_L8_SUB - MY_ALIGN - - -CGEMM_L4x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x1_SUB2_2 - LOAD4x1_2 - KERNEL4x1_L2 16,64, 0,0 - KERNEL4x1_E2 16,64, 1,1 - MY_ALIGN - - -CGEMM_L4x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x1_SUB2_1 - LOAD4x1_2 - KERNEL4x1_E2 16,64, 0,1 - MY_ALIGN - - -CGEMM_L4x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x1_SAVE - KERNEL4x1 - - MY_ALIGN -CGEMM_L4x1_SAVE: -/*----------------------------------------*/ - - SAVE4x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4 -#endif - - -CGEMM_L4x1_END: -/*----------------------------------------*/ - slwi T1, K, 5 - addic. J, J, -1 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 4 -#endif - bgt CGEMM_L4_BEGIN - - -CGEMM_L4_END: - -b CGEMM_L2 -/* MINI SUBROUTINES */ -/* 2x8 MAIN 128x+2 LOOP */ - - -CGEMM_L2x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x8_2 - MY_ALIGN -CGEMM_L2x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 -CGEMM_L2x8_K128: -/*----------------------------------------*/ - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_L2 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 128,32,8,0 - KERNEL2x8_L2 128,32,9,0 - KERNEL2x8_L2 128,32,10,0 - KERNEL2x8_L2 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L2 128,32,12,0 - KERNEL2x8_L2 128,32,13,0 - KERNEL2x8_L2 128,32,14,0 - KERNEL2x8_L2 128,32,15,0 - KERNEL2x8_L2 128,32,16,0 - KERNEL2x8_L2 128,32,17,0 - KERNEL2x8_L2 128,32,18,0 - KERNEL2x8_L2 128,32,19,0 - KERNEL2x8_L2 128,32,20,0 - KERNEL2x8_L2 128,32,21,0 - KERNEL2x8_L2 128,32,22,0 - KERNEL2x8_L2 128,32,23,0 - KERNEL2x8_L2 128,32,24,0 - KERNEL2x8_L2 128,32,25,0 - KERNEL2x8_L2 128,32,26,0 - KERNEL2x8_L2 128,32,27,0 - KERNEL2x8_L2 128,32,28,0 - KERNEL2x8_L2 128,32,29,0 - KERNEL2x8_L2 128,32,30,0 - KERNEL2x8_L2 128,32,31,0 - KERNEL2x8_L2 128,32,32,0 - KERNEL2x8_L2 128,32,33,0 - KERNEL2x8_L2 128,32,34,0 - KERNEL2x8_L2 128,32,35,0 - KERNEL2x8_L2 128,32,36,0 - KERNEL2x8_L2 128,32,37,0 - KERNEL2x8_L2 128,32,38,0 - KERNEL2x8_L2 128,32,39,0 - KERNEL2x8_L2 128,32,40,0 - KERNEL2x8_L2 128,32,41,0 - KERNEL2x8_L2 128,32,42,0 - KERNEL2x8_L2 128,32,43,0 - KERNEL2x8_L2 128,32,44,0 - KERNEL2x8_L2 128,32,45,0 - KERNEL2x8_L2 128,32,46,0 - KERNEL2x8_L2 128,32,47,0 - 
KERNEL2x8_L2 128,32,48,0 - KERNEL2x8_L2 128,32,49,0 - KERNEL2x8_L2 128,32,50,0 - KERNEL2x8_L2 128,32,51,0 - KERNEL2x8_L2 128,32,52,0 - KERNEL2x8_L2 128,32,53,0 - KERNEL2x8_L2 128,32,54,0 - KERNEL2x8_L2 128,32,55,0 - KERNEL2x8_L2 128,32,56,0 - KERNEL2x8_L2 128,32,57,0 - KERNEL2x8_L2 128,32,58,0 - KERNEL2x8_L2 128,32,59,0 - KERNEL2x8_L2 128,32,60,0 - KERNEL2x8_L2 128,32,61,0 - KERNEL2x8_L2 128,32,62,0 - KERNEL2x8_L2 128,32,63,1 - bdnz CGEMM_L2x8_LOOP - MY_ALIGN -CGEMM_L2x8_LOOP_END: -/*----------------------------------------*/ - END2x8_2 - blr - MY_ALIGN - - -CGEMM_2x8_L64_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_L2 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 128,32,8,0 - KERNEL2x8_L2 128,32,9,0 - KERNEL2x8_L2 128,32,10,0 - KERNEL2x8_L2 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L2 128,32,12,0 - KERNEL2x8_L2 128,32,13,0 - KERNEL2x8_L2 128,32,14,0 - KERNEL2x8_L2 128,32,15,0 - KERNEL2x8_L2 128,32,16,0 - KERNEL2x8_L2 128,32,17,0 - KERNEL2x8_L2 128,32,18,0 - KERNEL2x8_L2 128,32,19,0 - KERNEL2x8_L2 128,32,20,0 - KERNEL2x8_L2 128,32,21,0 - KERNEL2x8_L2 128,32,22,0 - KERNEL2x8_L2 128,32,23,0 - KERNEL2x8_L2 128,32,24,0 - KERNEL2x8_L2 128,32,25,0 - KERNEL2x8_L2 128,32,26,0 - KERNEL2x8_L2 128,32,27,0 - KERNEL2x8_L2 128,32,28,0 - KERNEL2x8_L2 128,32,29,0 - KERNEL2x8_L2 128,32,30,0 - KERNEL2x8_E2 128,32,31,1 - blr - MY_ALIGN - - -CGEMM_2x8_L32_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_L2 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 128,32,8,0 - KERNEL2x8_L2 128,32,9,0 - KERNEL2x8_L2 128,32,10,0 - KERNEL2x8_L2 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L2 128,32,12,0 - KERNEL2x8_L2 128,32,13,0 - KERNEL2x8_L2 128,32,14,0 - KERNEL2x8_E2 128,32,15,1 - blr - MY_ALIGN - - -CGEMM_2x8_L16_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_E2 128,32,7,1 - blr - MY_ALIGN - - -CGEMM_2x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x4_2 - MY_ALIGN -CGEMM_L2x4_LOOP: -/*----------------------------------------*/ - KERNEL2x4_L2 64,32,0,0 -CGEMM_L2x4_K32: -/*----------------------------------------*/ - KERNEL2x4_L2 64,32,1,0 - KERNEL2x4_L2 64,32,2,0 - KERNEL2x4_L2 64,32,3,0 - KERNEL2x4_L2 64,32,4,0 - KERNEL2x4_L2 64,32,5,0 - KERNEL2x4_L2 64,32,6,0 - KERNEL2x4_L2 64,32,7,0 - KERNEL2x4_L2 64,32,8,0 - KERNEL2x4_L2 64,32,9,0 - KERNEL2x4_L2 64,32,10,0 - KERNEL2x4_L2 64,32,11,0 - KERNEL2x4_L2 64,32,12,0 - KERNEL2x4_L2 64,32,13,0 - KERNEL2x4_L2 64,32,14,0 - KERNEL2x4_L2 64,32,15,1 - bdnz CGEMM_L2x4_LOOP - MY_ALIGN -CGEMM_L2x4_LOOP_END: -/*----------------------------------------*/ - END2x4_2 - blr - MY_ALIGN - - -CGEMM_2x4_L16_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 
64,32,0,0 - KERNEL2x4_L2 64,32,1,0 - KERNEL2x4_L2 64,32,2,0 - KERNEL2x4_L2 64,32,3,0 - KERNEL2x4_L2 64,32,4,0 - KERNEL2x4_L2 64,32,5,0 - KERNEL2x4_L2 64,32,6,0 - KERNEL2x4_E2 64,32,7,1 - blr - MY_ALIGN - - -CGEMM_2x4_L8_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 64,32,0,0 - KERNEL2x4_L2 64,32,1,0 - KERNEL2x4_L2 64,32,2,0 - KERNEL2x4_E2 64,32,3,1 - blr - - -CGEMM_2x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x2_2 - MY_ALIGN -CGEMM_L2x2_LOOP: -/*----------------------------------------*/ - KERNEL2x2_L2 32,32,0,0 -CGEMM_L2x2_K32: -/*----------------------------------------*/ - KERNEL2x2_L2 32,32,1,0 - KERNEL2x2_L2 32,32,2,0 - KERNEL2x2_L2 32,32,3,0 - KERNEL2x2_L2 32,32,4,0 - KERNEL2x2_L2 32,32,5,0 - KERNEL2x2_L2 32,32,6,0 - KERNEL2x2_L2 32,32,7,0 - KERNEL2x2_L2 32,32,8,0 - KERNEL2x2_L2 32,32,9,0 - KERNEL2x2_L2 32,32,10,0 - KERNEL2x2_L2 32,32,11,0 - KERNEL2x2_L2 32,32,12,0 - KERNEL2x2_L2 32,32,13,0 - KERNEL2x2_L2 32,32,14,0 - KERNEL2x2_L2 32,32,15,1 - bdnz CGEMM_L2x2_LOOP - MY_ALIGN - - -CGEMM_L2x2_LOOP_END: -/*----------------------------------------*/ - END2x2_2 - blr - MY_ALIGN -CGEMM_2x2_L16_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 32,32,0,0 - KERNEL2x2_L2 32,32,1,0 - KERNEL2x2_L2 32,32,2,0 - KERNEL2x2_L2 32,32,3,0 - KERNEL2x2_L2 32,32,4,0 - KERNEL2x2_L2 32,32,5,0 - KERNEL2x2_L2 32,32,6,0 - KERNEL2x2_E2 32,32,7,1 - blr - MY_ALIGN -CGEMM_2x2_L8_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 32,32,0,0 - KERNEL2x2_L2 32,32,1,0 - KERNEL2x2_L2 32,32,2,0 - KERNEL2x2_E2 32,32,3,1 - blr - - -CGEMM_2x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x1_2 - MY_ALIGN -CGEMM_L2x1_LOOP: -/*----------------------------------------*/ - KERNEL2x1_L2 16,32,0,0 -CGEMM_L2x1_K32: -/*----------------------------------------*/ - KERNEL2x1_L2 16,32,1,0 - KERNEL2x1_L2 16,32,2,0 - KERNEL2x1_L2 16,32,3,0 - KERNEL2x1_L2 16,32,4,0 - KERNEL2x1_L2 16,32,5,0 - KERNEL2x1_L2 16,32,6,0 - KERNEL2x1_L2 16,32,7,0 - KERNEL2x1_L2 16,32,8,0 - KERNEL2x1_L2 16,32,9,0 - KERNEL2x1_L2 16,32,10,0 - KERNEL2x1_L2 16,32,11,0 - KERNEL2x1_L2 16,32,12,0 - KERNEL2x1_L2 16,32,13,0 - KERNEL2x1_L2 16,32,14,0 - KERNEL2x1_L2 16,32,15,1 - bdnz CGEMM_L2x1_LOOP - MY_ALIGN -CGEMM_L2x1_LOOP_END: -/*----------------------------------------*/ - END2x1_2 - blr - - MY_ALIGN -CGEMM_2x1_L16_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 16,32,0,0 - KERNEL2x1_L2 16,32,1,0 - KERNEL2x1_L2 16,32,2,0 - KERNEL2x1_L2 16,32,3,0 - KERNEL2x1_L2 16,32,4,0 - KERNEL2x1_L2 16,32,5,0 - KERNEL2x1_L2 16,32,6,0 - KERNEL2x1_E2 16,32,7,1 - blr - MY_ALIGN - - -CGEMM_2x1_L8_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 16,32,0,0 - KERNEL2x1_L2 16,32,1,0 - KERNEL2x1_L2 16,32,2,0 - KERNEL2x1_E2 16,32,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -CGEMM_L2: -/*----------------------------------------*/ - - andi. J, N, 2 - ble CGEMM_L2_END - - -CGEMM_L2_BEGIN: -/*----------------------------------------*/ - mr CO, C - slwi T1, LDC , 1 - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. 
I, M, 3 - ble CGEMM_L2x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -CGEMM_L2x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T1-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO2x8 - ble CGEMM_L2x8_SUB0 - bl CGEMM_L2x8_LMAIN_SUB - andi. L, T1, 127 - ble CGEMM_L2x8_SAVE - b CGEMM_L2x8_SUB2 - - -CGEMM_L2x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP2x8_128K - addi BO,BO,-16 - addi AO,AO,-64 - LOAD2x8O 64,16 - END2x8_WITHOUT_ADD - LOAD2x8_2O 128, 32 - mtctr T8 - bl CGEMM_L2x8_K128 - b CGEMM_L2x8_SAVE - CMP2x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne CGEMM_L2x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-128 - LOAD2x8_2O 128,32 - bl CGEMM_L2x8_K128 - b CGEMM_L2x8_SAVE - MY_ALIGN - - -CGEMM_L2x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble CGEMM_L2x8_SUB2_32 - bl CGEMM_2x8_L64_SUB - MY_ALIGN - - -CGEMM_L2x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble CGEMM_L2x8_SUB2_16 - bl CGEMM_2x8_L32_SUB - MY_ALIGN - - -CGEMM_L2x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x8_SUB2_8 - bl CGEMM_2x8_L16_SUB - MY_ALIGN - - -CGEMM_L2x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x8_SUB2_4 - LOAD2x8_2 - KERNEL2x8_L2 128,32, 0,0 - KERNEL2x8_L2 128,32, 1,0 - KERNEL2x8_L2 128,32, 2,0 - KERNEL2x8_E2 128,32, 3,1 - MY_ALIGN - - -CGEMM_L2x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x8_SUB2_2 - LOAD2x8_2 - KERNEL2x8_L2 128,32, 0,0 - KERNEL2x8_E2 128,32, 1,1 - MY_ALIGN - - -CGEMM_L2x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x8_SUB2_1 - LOAD2x8_2 - KERNEL2x8_E2 128,32, 0,1 - MY_ALIGN - - -CGEMM_L2x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L2x8_SAVE - KERNEL2x8 - - MY_ALIGN -CGEMM_L2x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - MY_ALIGN - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 -#endif - bgt CGEMM_L2x8_BEGIN - andi. T2, M, 7 - ble CGEMM_L2x1_END - andi. T1, M, 4 - ble CGEMM_L2x4_END - b CGEMM_L2x4_BEGIN - MY_ALIGN - - -CGEMM_L2x8_END: -/*----------------------------------------*/ - - -CGEMM_L2x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble CGEMM_L2x1_END - andi. T1, M, 4 - ble CGEMM_L2x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x4 - ble CGEMM_L2x4_SUB0 - bl CGEMM_2x4_LMAIN_SUB - andi. 
L, T1, 31 - ble CGEMM_L2x4_SAVE - b CGEMM_L2x4_SUB2 - - -CGEMM_L2x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x4_32K - addi BO,BO,-16 - addi AO,AO,-32 - LOAD2x4O 32,16 - END2x4_WITHOUT_ADD - LOAD2x4_2O 64, 32 - mtctr T8 - bl CGEMM_L2x4_K32 - b CGEMM_L2x4_SAVE - CMP2x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L2x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-64 - LOAD2x4_2O 64,32 - bl CGEMM_L2x4_K32 - b CGEMM_L2x4_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L2x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x4_SUB2_8 - bl CGEMM_2x4_L16_SUB - MY_ALIGN - - -CGEMM_L2x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x4_SUB2_4 - bl CGEMM_2x4_L8_SUB - MY_ALIGN - - -CGEMM_L2x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x4_SUB2_2 - LOAD2x4_2 - KERNEL2x4_L2 64,32, 0,0 - KERNEL2x4_E2 64,32, 1,1 - MY_ALIGN - - -CGEMM_L2x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x4_SUB2_1 - LOAD2x4_2 - KERNEL2x4_E2 64,32, 0,1 - MY_ALIGN - - -CGEMM_L2x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L2x4_SAVE - KERNEL2x4 - - -CGEMM_L2x4_SAVE: -/*----------------------------------------*/ - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 -#endif - - -CGEMM_L2x4_END: -/*----------------------------------------*/ - - -CGEMM_L2x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble CGEMM_L2x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x2 - ble CGEMM_L2x2_SUB0 - bl CGEMM_2x2_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L2x2_SAVE - b CGEMM_L2x2_SUB2 - - -CGEMM_L2x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x2_32K - addi BO,BO,-16 - addi AO,AO,-16 - LOAD2x2O 16,16 - END2x2_WITHOUT_ADD - LOAD2x2_2O 32, 32 - mtctr T8 - bl CGEMM_L2x2_K32 - b CGEMM_L2x2_SAVE - CMP2x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L2x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-32 - LOAD2x2_2O 32,32 - bl CGEMM_L2x2_K32 - b CGEMM_L2x2_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L2x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x2_SUB2_8 - bl CGEMM_2x2_L16_SUB - MY_ALIGN - - -CGEMM_L2x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x2_SUB2_4 - bl CGEMM_2x2_L8_SUB - MY_ALIGN - - -CGEMM_L2x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x2_SUB2_2 - LOAD2x2_2 - KERNEL2x2_L2 32,32, 0,0 - KERNEL2x2_E2 32,32, 1,1 - MY_ALIGN - - -CGEMM_L2x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x2_SUB2_1 - LOAD2x2_2 - KERNEL2x2_E2 32,32, 0,1 - MY_ALIGN - - -CGEMM_L2x2_SUB2_1: -/*----------------------------------------*/ - andi. 
T1,L, 1 - ble CGEMM_L2x2_SAVE - KERNEL2x2 - - MY_ALIGN -CGEMM_L2x2_SAVE: -/*----------------------------------------*/ - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 -#endif - - -CGEMM_L2x2_END: -/*----------------------------------------*/ - - -CGEMM_L2x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble CGEMM_L2x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x1 - ble CGEMM_L2x1_SUB0 - bl CGEMM_2x1_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L2x1_SAVE - b CGEMM_L2x1_SUB2 - - -CGEMM_L2x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x1_32K - addi BO,BO,-16 - addi AO,AO,-8 - LOAD2x1O 8,16 - END2x1_WITHOUT_ADD - LOAD2x1_2O 16, 32 - mtctr T8 - bl CGEMM_L2x1_K32 - b CGEMM_L2x1_SAVE - CMP2x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L2x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-16 - LOAD2x1_2O 16,32 - bl CGEMM_L2x1_K32 - b CGEMM_L2x1_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L2x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x1_SUB2_8 - bl CGEMM_2x1_L16_SUB - MY_ALIGN - - -CGEMM_L2x1_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x1_SUB2_4 - bl CGEMM_2x1_L8_SUB - MY_ALIGN - - -CGEMM_L2x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x1_SUB2_2 - LOAD2x1_2 - KERNEL2x1_L2 16,32, 0,0 - KERNEL2x1_E2 16,32, 1,1 - MY_ALIGN - - -CGEMM_L2x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x1_SUB2_1 - LOAD2x1_2 - KERNEL2x1_E2 16,32, 0,1 - MY_ALIGN - - -CGEMM_L2x1_SUB2_1: -/*----------------------------------------*/ - andi. 
T1,L, 1 - ble CGEMM_L2x1_SAVE - KERNEL2x1 - - MY_ALIGN -CGEMM_L2x1_SAVE: -/*----------------------------------------*/ - - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 -#endif - - -CGEMM_L2x1_END: -/*----------------------------------------*/ - slwi T1, K, 4 - - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif - -CGEMM_L2_END: - - -b CGEMM_L1 -/* MINI SUBROUTINES */ -/* 1x8 MAIN 128x+2 LOOP */ - - -CGEMM_L1x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x8_2 - MY_ALIGN -CGEMM_L1x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 -CGEMM_L1x8_K128: -/*----------------------------------------*/ - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_L2 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 128,16,8,0 - KERNEL1x8_L2 128,16,9,0 - KERNEL1x8_L2 128,16,10,0 - KERNEL1x8_L2 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L2 128,16,12,0 - KERNEL1x8_L2 128,16,13,0 - KERNEL1x8_L2 128,16,14,0 - KERNEL1x8_L2 128,16,15,0 - KERNEL1x8_L2 128,16,16,0 - KERNEL1x8_L2 128,16,17,0 - KERNEL1x8_L2 128,16,18,0 - KERNEL1x8_L2 128,16,19,0 - KERNEL1x8_L2 128,16,20,0 - KERNEL1x8_L2 128,16,21,0 - KERNEL1x8_L2 128,16,22,0 - KERNEL1x8_L2 128,16,23,0 - KERNEL1x8_L2 128,16,24,0 - KERNEL1x8_L2 128,16,25,0 - KERNEL1x8_L2 128,16,26,0 - KERNEL1x8_L2 128,16,27,0 - KERNEL1x8_L2 128,16,28,0 - KERNEL1x8_L2 128,16,29,0 - KERNEL1x8_L2 128,16,30,0 - KERNEL1x8_L2 128,16,31,0 - KERNEL1x8_L2 128,16,32,0 - KERNEL1x8_L2 128,16,33,0 - KERNEL1x8_L2 128,16,34,0 - KERNEL1x8_L2 128,16,35,0 - KERNEL1x8_L2 128,16,36,0 - KERNEL1x8_L2 128,16,37,0 - KERNEL1x8_L2 128,16,38,0 - KERNEL1x8_L2 128,16,39,0 - KERNEL1x8_L2 128,16,40,0 - KERNEL1x8_L2 128,16,41,0 - KERNEL1x8_L2 128,16,42,0 - KERNEL1x8_L2 128,16,43,0 - KERNEL1x8_L2 128,16,44,0 - KERNEL1x8_L2 128,16,45,0 - KERNEL1x8_L2 128,16,46,0 - KERNEL1x8_L2 128,16,47,0 - KERNEL1x8_L2 128,16,48,0 - KERNEL1x8_L2 128,16,49,0 - KERNEL1x8_L2 128,16,50,0 - KERNEL1x8_L2 128,16,51,0 - KERNEL1x8_L2 128,16,52,0 - KERNEL1x8_L2 128,16,53,0 - KERNEL1x8_L2 128,16,54,0 - KERNEL1x8_L2 128,16,55,0 - KERNEL1x8_L2 128,16,56,0 - KERNEL1x8_L2 128,16,57,0 - KERNEL1x8_L2 128,16,58,0 - KERNEL1x8_L2 128,16,59,0 - KERNEL1x8_L2 128,16,60,0 - KERNEL1x8_L2 128,16,61,0 - KERNEL1x8_L2 128,16,62,0 - KERNEL1x8_L2 128,16,63,1 - bdnz CGEMM_L1x8_LOOP - MY_ALIGN -CGEMM_L1x8_LOOP_END: -/*----------------------------------------*/ - END1x8_2 - blr - MY_ALIGN - - -CGEMM_1x8_L64_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_L2 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 128,16,8,0 - KERNEL1x8_L2 128,16,9,0 - KERNEL1x8_L2 128,16,10,0 - KERNEL1x8_L2 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L2 128,16,12,0 - KERNEL1x8_L2 128,16,13,0 - KERNEL1x8_L2 128,16,14,0 - KERNEL1x8_L2 128,16,15,0 - KERNEL1x8_L2 128,16,16,0 - KERNEL1x8_L2 128,16,17,0 - KERNEL1x8_L2 128,16,18,0 - KERNEL1x8_L2 128,16,19,0 - KERNEL1x8_L2 128,16,20,0 - KERNEL1x8_L2 128,16,21,0 - KERNEL1x8_L2 128,16,22,0 - KERNEL1x8_L2 128,16,23,0 - KERNEL1x8_L2 
128,16,24,0 - KERNEL1x8_L2 128,16,25,0 - KERNEL1x8_L2 128,16,26,0 - KERNEL1x8_L2 128,16,27,0 - KERNEL1x8_L2 128,16,28,0 - KERNEL1x8_L2 128,16,29,0 - KERNEL1x8_L2 128,16,30,0 - KERNEL1x8_E2 128,16,31,1 - blr - MY_ALIGN - - -CGEMM_1x8_L32_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_L2 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 128,16,8,0 - KERNEL1x8_L2 128,16,9,0 - KERNEL1x8_L2 128,16,10,0 - KERNEL1x8_L2 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L2 128,16,12,0 - KERNEL1x8_L2 128,16,13,0 - KERNEL1x8_L2 128,16,14,0 - KERNEL1x8_E2 128,16,15,1 - blr - MY_ALIGN - - -CGEMM_1x8_L16_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_E2 128,16,7,1 - blr - MY_ALIGN - - -CGEMM_1x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x4_2 - MY_ALIGN -CGEMM_L1x4_LOOP: -/*----------------------------------------*/ - KERNEL1x4_L2 64,16,0,0 -CGEMM_L1x4_K32: -/*----------------------------------------*/ - KERNEL1x4_L2 64,16,1,0 - KERNEL1x4_L2 64,16,2,0 - KERNEL1x4_L2 64,16,3,0 - KERNEL1x4_L2 64,16,4,0 - KERNEL1x4_L2 64,16,5,0 - KERNEL1x4_L2 64,16,6,0 - KERNEL1x4_L2 64,16,7,0 - KERNEL1x4_L2 64,16,8,0 - KERNEL1x4_L2 64,16,9,0 - KERNEL1x4_L2 64,16,10,0 - KERNEL1x4_L2 64,16,11,0 - KERNEL1x4_L2 64,16,12,0 - KERNEL1x4_L2 64,16,13,0 - KERNEL1x4_L2 64,16,14,0 - KERNEL1x4_L2 64,16,15,1 - bdnz CGEMM_L1x4_LOOP - MY_ALIGN -CGEMM_L1x4_LOOP_END: -/*----------------------------------------*/ - END1x4_2 - blr - MY_ALIGN - - -CGEMM_1x4_L16_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 64,16,0,0 - KERNEL1x4_L2 64,16,1,0 - KERNEL1x4_L2 64,16,2,0 - KERNEL1x4_L2 64,16,3,0 - KERNEL1x4_L2 64,16,4,0 - KERNEL1x4_L2 64,16,5,0 - KERNEL1x4_L2 64,16,6,0 - KERNEL1x4_E2 64,16,7,1 - blr - MY_ALIGN - - -CGEMM_1x4_L8_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 64,16,0,0 - KERNEL1x4_L2 64,16,1,0 - KERNEL1x4_L2 64,16,2,0 - KERNEL1x4_E2 64,16,3,1 - blr - - -CGEMM_1x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x2_2 - MY_ALIGN -CGEMM_L1x2_LOOP: -/*----------------------------------------*/ - KERNEL1x2_L2 32,16,0,0 -CGEMM_L1x2_K32: -/*----------------------------------------*/ - KERNEL1x2_L2 32,16,1,0 - KERNEL1x2_L2 32,16,2,0 - KERNEL1x2_L2 32,16,3,0 - KERNEL1x2_L2 32,16,4,0 - KERNEL1x2_L2 32,16,5,0 - KERNEL1x2_L2 32,16,6,0 - KERNEL1x2_L2 32,16,7,0 - KERNEL1x2_L2 32,16,8,0 - KERNEL1x2_L2 32,16,9,0 - KERNEL1x2_L2 32,16,10,0 - KERNEL1x2_L2 32,16,11,0 - KERNEL1x2_L2 32,16,12,0 - KERNEL1x2_L2 32,16,13,0 - KERNEL1x2_L2 32,16,14,0 - KERNEL1x2_L2 32,16,15,1 - bdnz CGEMM_L1x2_LOOP - MY_ALIGN - - -CGEMM_L1x2_LOOP_END: -/*----------------------------------------*/ - END1x2_2 - blr - MY_ALIGN -CGEMM_1x2_L16_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 32,16,0,0 - KERNEL1x2_L2 32,16,1,0 - KERNEL1x2_L2 32,16,2,0 - KERNEL1x2_L2 32,16,3,0 - KERNEL1x2_L2 32,16,4,0 - KERNEL1x2_L2 32,16,5,0 - KERNEL1x2_L2 32,16,6,0 - 
KERNEL1x2_E2 32,16,7,1 - blr - MY_ALIGN -CGEMM_1x2_L8_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 32,16,0,0 - KERNEL1x2_L2 32,16,1,0 - KERNEL1x2_L2 32,16,2,0 - KERNEL1x2_E2 32,16,3,1 - blr - - -CGEMM_1x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x1_2 - MY_ALIGN -CGEMM_L1x1_LOOP: -/*----------------------------------------*/ - KERNEL1x1_L2 16,16,0,0 -CGEMM_L1x1_K32: -/*----------------------------------------*/ - KERNEL1x1_L2 16,16,1,0 - KERNEL1x1_L2 16,16,2,0 - KERNEL1x1_L2 16,16,3,0 - KERNEL1x1_L2 16,16,4,0 - KERNEL1x1_L2 16,16,5,0 - KERNEL1x1_L2 16,16,6,0 - KERNEL1x1_L2 16,16,7,0 - KERNEL1x1_L2 16,16,8,0 - KERNEL1x1_L2 16,16,9,0 - KERNEL1x1_L2 16,16,10,0 - KERNEL1x1_L2 16,16,11,0 - KERNEL1x1_L2 16,16,12,0 - KERNEL1x1_L2 16,16,13,0 - KERNEL1x1_L2 16,16,14,0 - KERNEL1x1_L2 16,16,15,1 - bdnz CGEMM_L1x1_LOOP - MY_ALIGN -CGEMM_L1x1_LOOP_END: -/*----------------------------------------*/ - END1x1_2 - blr - - MY_ALIGN -CGEMM_1x1_L16_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 16,16,0,0 - KERNEL1x1_L2 16,16,1,0 - KERNEL1x1_L2 16,16,2,0 - KERNEL1x1_L2 16,16,3,0 - KERNEL1x1_L2 16,16,4,0 - KERNEL1x1_L2 16,16,5,0 - KERNEL1x1_L2 16,16,6,0 - KERNEL1x1_E2 16,16,7,1 - blr - MY_ALIGN - - -CGEMM_1x1_L8_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 16,16,0,0 - KERNEL1x1_L2 16,16,1,0 - KERNEL1x1_L2 16,16,2,0 - KERNEL1x1_E2 16,16,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -CGEMM_L1: -/*----------------------------------------*/ - - andi. J, N, 1 - ble CGEMM_L1_END - -CGEMM_L1_BEGIN: -/*----------------------------------------*/ - mr CO, C - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble CGEMM_L1x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -CGEMM_L1x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T1-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO1x8 - ble CGEMM_L1x8_SUB0 - bl CGEMM_L1x8_LMAIN_SUB - andi. L, T1, 127 - ble CGEMM_L1x8_SAVE - b CGEMM_L1x8_SUB2 - - -CGEMM_L1x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP1x8_128K - addi BO,BO,-8 - addi AO,AO,-64 - LOAD1x8O 64,8 - END1x8_WITHOUT_ADD - LOAD1x8_2O 128, 16 - mtctr T8 - bl CGEMM_L1x8_K128 - b CGEMM_L1x8_SAVE - CMP1x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne CGEMM_L1x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-128 - LOAD1x8_2O 128,16 - bl CGEMM_L1x8_K128 - b CGEMM_L1x8_SAVE - MY_ALIGN - - -CGEMM_L1x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble CGEMM_L1x8_SUB2_32 - bl CGEMM_1x8_L64_SUB - MY_ALIGN - - -CGEMM_L1x8_SUB2_32: -/*----------------------------------------*/ - andi. 
T1,L, 32 - ble CGEMM_L1x8_SUB2_16 - bl CGEMM_1x8_L32_SUB - MY_ALIGN - - -CGEMM_L1x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x8_SUB2_8 - bl CGEMM_1x8_L16_SUB - MY_ALIGN - - -CGEMM_L1x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L1x8_SUB2_4 - LOAD1x8_2 - KERNEL1x8_L2 128,16, 0,0 - KERNEL1x8_L2 128,16, 1,0 - KERNEL1x8_L2 128,16, 2,0 - KERNEL1x8_E2 128,16, 3,1 - MY_ALIGN - - -CGEMM_L1x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x8_SUB2_2 - LOAD1x8_2 - KERNEL1x8_L2 128,16, 0,0 - KERNEL1x8_E2 128,16, 1,1 - MY_ALIGN - - -CGEMM_L1x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x8_SUB2_1 - LOAD1x8_2 - KERNEL1x8_E2 128,16, 0,1 - MY_ALIGN - - -CGEMM_L1x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x8_SAVE - KERNEL1x8 - - MY_ALIGN -CGEMM_L1x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - MY_ALIGN - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 -#endif - bgt CGEMM_L1x8_BEGIN - andi. T2, M, 7 - ble CGEMM_L1x1_END - andi. T1, M, 4 - ble CGEMM_L1x4_END - b CGEMM_L1x4_BEGIN - MY_ALIGN - - -CGEMM_L1x8_END: -/*----------------------------------------*/ - - -CGEMM_L1x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble CGEMM_L1x1_END - andi. T1, M, 4 - ble CGEMM_L1x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 31x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 31x */ -#endif - ZERO1x4 - ble CGEMM_L1x4_SUB0 - bl CGEMM_1x4_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L1x4_SAVE - b CGEMM_L1x4_SUB2 - - -CGEMM_L1x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x4_32K - addi BO,BO,-8 - addi AO,AO,-32 - LOAD1x4O 32,8 - END1x4_WITHOUT_ADD - LOAD1x4_2O 64, 16 - mtctr T8 - bl CGEMM_L1x4_K32 - b CGEMM_L1x4_SAVE - CMP1x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L1x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-64 - LOAD1x4_2O 64,16 - bl CGEMM_L1x4_K32 - b CGEMM_L1x4_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L1x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x4_SUB2_8 - bl CGEMM_1x4_L16_SUB - MY_ALIGN - - -CGEMM_L1x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L1x4_SUB2_4 - bl CGEMM_1x4_L8_SUB - MY_ALIGN - - -CGEMM_L1x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x4_SUB2_2 - LOAD1x4_2 - KERNEL1x4_L2 64,16, 0,0 - KERNEL1x4_E2 64,16, 1,1 - MY_ALIGN - - -CGEMM_L1x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x4_SUB2_1 - LOAD1x4_2 - KERNEL1x4_E2 64,16, 0,1 - MY_ALIGN - - -CGEMM_L1x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x4_SAVE - KERNEL1x4 - - -CGEMM_L1x4_SAVE: -/*----------------------------------------*/ - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 -#endif - - -CGEMM_L1x4_END: -/*----------------------------------------*/ - - -CGEMM_L1x2_BEGIN: -/*----------------------------------------*/ - andi. 
T1, M, 2 - ble CGEMM_L1x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 31x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 31x */ -#endif - ZERO1x2 - ble CGEMM_L1x2_SUB0 - bl CGEMM_1x2_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L1x2_SAVE - b CGEMM_L1x2_SUB2 - - -CGEMM_L1x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x2_32K - addi BO,BO,-8 - addi AO,AO,-16 - LOAD1x2O 16,8 - END1x2_WITHOUT_ADD - LOAD1x2_2O 32, 16 - mtctr T8 - bl CGEMM_L1x2_K32 - b CGEMM_L1x2_SAVE - CMP1x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L1x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-32 - LOAD1x2_2O 32,16 - bl CGEMM_L1x2_K32 - b CGEMM_L1x2_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L1x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x2_SUB2_8 - bl CGEMM_1x2_L16_SUB - MY_ALIGN - - -CGEMM_L1x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L1x2_SUB2_4 - bl CGEMM_1x2_L8_SUB - MY_ALIGN - - -CGEMM_L1x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x2_SUB2_2 - LOAD1x2_2 - KERNEL1x2_L2 32,16, 0,0 - KERNEL1x2_E2 32,16, 1,1 - MY_ALIGN - - -CGEMM_L1x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x2_SUB2_1 - LOAD1x2_2 - KERNEL1x2_E2 32,16, 0,1 - MY_ALIGN - - -CGEMM_L1x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x2_SAVE - KERNEL1x2 - - MY_ALIGN -CGEMM_L1x2_SAVE: -/*----------------------------------------*/ - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 -#endif - - -CGEMM_L1x2_END: -/*----------------------------------------*/ - - -CGEMM_L1x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble CGEMM_L1x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 31x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 31x */ -#endif - ZERO1x1 - ble CGEMM_L1x1_SUB0 - bl CGEMM_1x1_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L1x1_SAVE - b CGEMM_L1x1_SUB2 - - -CGEMM_L1x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x1_32K - addi BO,BO,-8 - addi AO,AO,-8 - LOAD1x1O 8,8 - END1x1_WITHOUT_ADD - LOAD1x1_2O 16, 16 - mtctr T8 - bl CGEMM_L1x1_K32 - b CGEMM_L1x1_SAVE - CMP1x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L1x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-16 - LOAD1x1_2O 16,16 - bl CGEMM_L1x1_K32 - b CGEMM_L1x1_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L1x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x1_SUB2_8 - bl CGEMM_1x1_L16_SUB - MY_ALIGN - - -CGEMM_L1x1_SUB2_8: -/*----------------------------------------*/ - andi. 
T1,L, 8 - ble CGEMM_L1x1_SUB2_4 - bl CGEMM_1x1_L8_SUB - MY_ALIGN - - -CGEMM_L1x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x1_SUB2_2 - LOAD1x1_2 - KERNEL1x1_L2 16,16, 0,0 - KERNEL1x1_E2 16,16, 1,1 - MY_ALIGN - - -CGEMM_L1x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x1_SUB2_1 - LOAD1x1_2 - KERNEL1x1_E2 16,16, 0,1 - MY_ALIGN - - -CGEMM_L1x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x1_SAVE - KERNEL1x1 - - MY_ALIGN -CGEMM_L1x1_SAVE: -/*----------------------------------------*/ - - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 -#endif - - -CGEMM_L1x1_END: -/*----------------------------------------*/ - slwi T1, K, 3 - - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif - -CGEMM_L1_END: - - - - +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+/**************************************************************************************
+* Abdelrauf(quickwritereader@gmail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+#define MY_ALIGN .align 3
+b CGEMM_L4
+/* MINI SUBROUTINES */
+/* 4x8 MAIN 128x+2 LOOP */
+
+
+CGEMM_L4x8_LMAIN_SUB:
+/*----------------------------------------*/
+ mtctr T8
+ LOAD4x8_2
+ MY_ALIGN
+CGEMM_L4x8_LOOP:
+/*----------------------------------------*/
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL4x8_L2 128,64,0,0
+CGEMM_L4x8_K128:
+/*----------------------------------------*/
+ KERNEL4x8_L2 128,64,1,0
+ dcbt AO, T2
+ KERNEL4x8_L2 128,64,2,0
+ KERNEL4x8_L2 128,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL4x8_L2 128,64,4,0
+ KERNEL4x8_L2 128,64,5,0
+ dcbt AO, T4
+ KERNEL4x8_L2 128,64,6,0
+ KERNEL4x8_L2 128,64,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL4x8_L2 128,64,8,0
+ KERNEL4x8_L2 128,64,9,0
+ KERNEL4x8_L2 128,64,10,0
+ KERNEL4x8_L2 128,64,11,0
+ dcbt BO, T4
+ KERNEL4x8_L2 128,64,12,0
+ KERNEL4x8_L2 128,64,13,0
+ KERNEL4x8_L2 128,64,14,0
+ KERNEL4x8_L2 128,64,15,0
+ KERNEL4x8_L2 128,64,16,0
+ KERNEL4x8_L2 128,64,17,0
+ KERNEL4x8_L2 128,64,18,0
+ KERNEL4x8_L2 128,64,19,0
+ KERNEL4x8_L2 128,64,20,0
+ KERNEL4x8_L2 128,64,21,0
+ KERNEL4x8_L2 128,64,22,0
+ KERNEL4x8_L2 128,64,23,0
+ KERNEL4x8_L2 128,64,24,0
+ KERNEL4x8_L2 128,64,25,0
+ KERNEL4x8_L2 128,64,26,0
+ KERNEL4x8_L2 128,64,27,0
+ KERNEL4x8_L2 128,64,28,0
+ KERNEL4x8_L2 128,64,29,0
+ KERNEL4x8_L2 128,64,30,0
+ KERNEL4x8_L2 128,64,31,0
+ KERNEL4x8_L2 128,64,32,0
+ KERNEL4x8_L2 128,64,33,0
+ KERNEL4x8_L2 128,64,34,0
+ KERNEL4x8_L2 128,64,35,0
+ KERNEL4x8_L2 128,64,36,0
+ KERNEL4x8_L2 128,64,37,0
+ KERNEL4x8_L2 128,64,38,0
+ KERNEL4x8_L2 128,64,39,0
+ KERNEL4x8_L2 128,64,40,0
+ KERNEL4x8_L2 128,64,41,0
+ KERNEL4x8_L2 128,64,42,0
+ KERNEL4x8_L2 128,64,43,0
+ KERNEL4x8_L2 128,64,44,0
+ KERNEL4x8_L2 128,64,45,0
+ KERNEL4x8_L2 128,64,46,0
+ KERNEL4x8_L2 128,64,47,0
+ KERNEL4x8_L2 128,64,48,0
+ KERNEL4x8_L2 128,64,49,0
+ KERNEL4x8_L2 128,64,50,0
+ KERNEL4x8_L2 128,64,51,0
+ KERNEL4x8_L2 128,64,52,0
+ KERNEL4x8_L2 128,64,53,0
+ KERNEL4x8_L2 128,64,54,0
+ KERNEL4x8_L2 128,64,55,0
+ KERNEL4x8_L2 128,64,56,0
+ KERNEL4x8_L2 128,64,57,0
+ KERNEL4x8_L2 128,64,58,0
+ KERNEL4x8_L2 128,64,59,0
+ KERNEL4x8_L2 128,64,60,0
+ KERNEL4x8_L2 128,64,61,0
+ KERNEL4x8_L2 128,64,62,0
+ KERNEL4x8_L2 128,64,63,1
+ bdnz CGEMM_L4x8_LOOP
+ MY_ALIGN
+CGEMM_L4x8_LOOP_END:
+/*----------------------------------------*/
+ END4x8_2
+ blr
+ MY_ALIGN
+
+
+CGEMM_4x8_L64_SUB:
+/*----------------------------------------*/
+ LOAD4x8_2
+ dcbt AO, PRE
+ dcbt BO, PRE
+ KERNEL4x8_L2 128,64,0,0
+ KERNEL4x8_L2 128,64,1,0
+ dcbt AO, T2
+ KERNEL4x8_L2 128,64,2,0
+ KERNEL4x8_L2 128,64,3,0
+ dcbt AO, T3
+ dcbt BO, T2
+ KERNEL4x8_L2 128,64,4,0
+ KERNEL4x8_L2 128,64,5,0
+ dcbt AO, T4
+ KERNEL4x8_L2 128,64,6,0
+ KERNEL4x8_L2 128,64,7,0
+ dcbt AO, T5
+ dcbt BO, T3
+ KERNEL4x8_L2 128,64,8,0
+ KERNEL4x8_L2 128,64,9,0
+ KERNEL4x8_L2 128,64,10,0
+ KERNEL4x8_L2 128,64,11,0
+ dcbt BO, T4
+ KERNEL4x8_L2 128,64,12,0
+ KERNEL4x8_L2 128,64,13,0
+ KERNEL4x8_L2 128,64,14,0
+ KERNEL4x8_L2 128,64,15,0
+ KERNEL4x8_L2 128,64,16,0
+ KERNEL4x8_L2 128,64,17,0
+ KERNEL4x8_L2 128,64,18,0
+ KERNEL4x8_L2 128,64,19,0
+ KERNEL4x8_L2 128,64,20,0
+ KERNEL4x8_L2 128,64,21,0
+ KERNEL4x8_L2 128,64,22,0
+ KERNEL4x8_L2 128,64,23,0
+ KERNEL4x8_L2 128,64,24,0
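+/* the remaining unrolled KERNEL4x8_L2 steps of CGEMM_4x8_L64_SUB follow, closed by
+   the KERNEL4x8_E2 variant; each step consumes two k-iterations (AO advances 128
+   bytes, BO 64), and the interleaved dcbt touches at PRE and T2..T5 keep the A/B
+   panels streaming into cache ahead of the arithmetic. */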
KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_E2 128,64,31,1 + blr + MY_ALIGN + + +CGEMM_4x8_L32_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_E2 128,64,15,1 + blr + MY_ALIGN + + +CGEMM_4x8_L16_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_E2 128,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x4_2 + MY_ALIGN +CGEMM_L4x4_LOOP: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,0,0 +CGEMM_L4x4_K32: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_L2 64,64,7,0 + KERNEL4x4_L2 64,64,8,0 + KERNEL4x4_L2 64,64,9,0 + KERNEL4x4_L2 64,64,10,0 + KERNEL4x4_L2 64,64,11,0 + KERNEL4x4_L2 64,64,12,0 + KERNEL4x4_L2 64,64,13,0 + KERNEL4x4_L2 64,64,14,0 + KERNEL4x4_L2 64,64,15,1 + bdnz CGEMM_L4x4_LOOP + MY_ALIGN +CGEMM_L4x4_LOOP_END: +/*----------------------------------------*/ + END4x4_2 + blr + MY_ALIGN + + +CGEMM_4x4_L16_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_E2 64,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_L8_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_E2 64,64,3,1 + blr + + +CGEMM_4x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x2_2 + MY_ALIGN +CGEMM_L4x2_LOOP: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,0,0 +CGEMM_L4x2_K32: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_L2 32,64,7,0 + KERNEL4x2_L2 32,64,8,0 + KERNEL4x2_L2 32,64,9,0 + KERNEL4x2_L2 32,64,10,0 + KERNEL4x2_L2 32,64,11,0 + KERNEL4x2_L2 32,64,12,0 + KERNEL4x2_L2 32,64,13,0 + KERNEL4x2_L2 32,64,14,0 + KERNEL4x2_L2 32,64,15,1 + bdnz CGEMM_L4x2_LOOP + MY_ALIGN + + +CGEMM_L4x2_LOOP_END: +/*----------------------------------------*/ + END4x2_2 + blr + MY_ALIGN +CGEMM_4x2_L16_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_E2 32,64,7,1 + 
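+/* CGEMM_4x2_L16_SUB and CGEMM_4x2_L8_SUB below consume 16 or 8 leftover
+   k-iterations for a 4x2 tile, two per KERNEL4x2 step, then return (blr)
+   to the remainder dispatch code. */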
blr + MY_ALIGN +CGEMM_4x2_L8_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_E2 32,64,3,1 + blr + + +CGEMM_4x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x1_2 + MY_ALIGN +CGEMM_L4x1_LOOP: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,0,0 +CGEMM_L4x1_K32: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_L2 16,64,7,0 + KERNEL4x1_L2 16,64,8,0 + KERNEL4x1_L2 16,64,9,0 + KERNEL4x1_L2 16,64,10,0 + KERNEL4x1_L2 16,64,11,0 + KERNEL4x1_L2 16,64,12,0 + KERNEL4x1_L2 16,64,13,0 + KERNEL4x1_L2 16,64,14,0 + KERNEL4x1_L2 16,64,15,1 + bdnz CGEMM_L4x1_LOOP + MY_ALIGN +CGEMM_L4x1_LOOP_END: +/*----------------------------------------*/ + END4x1_2 + blr + + MY_ALIGN +CGEMM_4x1_L16_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_E2 16,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x1_L8_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_E2 16,64,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L4: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 2 + ble CGEMM_L4_END + + +CGEMM_L4_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 2 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L4x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L4x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,4 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO4x8 + ble CGEMM_L4x8_SUB0 + bl CGEMM_L4x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L4x8_SAVE + b CGEMM_L4x8_SUB2 + + +CGEMM_L4x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP4x8_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD4x8O 64,32 + END4x8_WITHOUT_ADD + LOAD4x8_2O 128, 64 + mtctr T8 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + CMP4x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L4x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD4x8_2O 128,64 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + MY_ALIGN + + +CGEMM_L4x8_SUB2: +/*----------------------------------------*/ + andi. 
T1,L, 64 + ble CGEMM_L4x8_SUB2_32 + bl CGEMM_4x8_L64_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L4x8_SUB2_16 + bl CGEMM_4x8_L32_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x8_SUB2_8 + bl CGEMM_4x8_L16_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x8_SUB2_4 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_L2 128,64, 1,0 + KERNEL4x8_L2 128,64, 2,0 + KERNEL4x8_E2 128,64, 3,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x8_SUB2_2 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_E2 128,64, 1,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x8_SUB2_1 + LOAD4x8_2 + KERNEL4x8_E2 128,64, 0,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x8_SAVE + KERNEL4x8 + + MY_ALIGN +CGEMM_L4x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4 +#endif + bgt CGEMM_L4x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END + b CGEMM_L4x4_BEGIN + MY_ALIGN + + +CGEMM_L4x8_END: +/*----------------------------------------*/ + + +CGEMM_L4x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x4 + ble CGEMM_L4x4_SUB0 + bl CGEMM_4x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x4_SAVE + b CGEMM_L4x4_SUB2 + + +CGEMM_L4x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x4_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD4x4O 32,32 + END4x4_WITHOUT_ADD + LOAD4x4_2O 64, 64 + mtctr T8 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + CMP4x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD4x4_2O 64,64 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x4_SUB2_8 + bl CGEMM_4x4_L16_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x4_SUB2_4 + bl CGEMM_4x4_L8_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x4_SUB2_2 + LOAD4x4_2 + KERNEL4x4_L2 64,64, 0,0 + KERNEL4x4_E2 64,64, 1,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x4_SUB2_1 + LOAD4x4_2 + KERNEL4x4_E2 64,64, 0,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L4x4_SAVE + KERNEL4x4 + + +CGEMM_L4x4_SAVE: +/*----------------------------------------*/ + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4 +#endif + + +CGEMM_L4x4_END: +/*----------------------------------------*/ + + +CGEMM_L4x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L4x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x2 + ble CGEMM_L4x2_SUB0 + bl CGEMM_4x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x2_SAVE + b CGEMM_L4x2_SUB2 + + +CGEMM_L4x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x2_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD4x2O 16,32 + END4x2_WITHOUT_ADD + LOAD4x2_2O 32, 64 + mtctr T8 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + CMP4x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD4x2_2O 32,64 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x2_SUB2_8 + bl CGEMM_4x2_L16_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x2_SUB2_4 + bl CGEMM_4x2_L8_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x2_SUB2_2 + LOAD4x2_2 + KERNEL4x2_L2 32,64, 0,0 + KERNEL4x2_E2 32,64, 1,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x2_SUB2_1 + LOAD4x2_2 + KERNEL4x2_E2 32,64, 0,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +CGEMM_L4x2_SAVE: +/*----------------------------------------*/ + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4 +#endif + + +CGEMM_L4x2_END: +/*----------------------------------------*/ + + +CGEMM_L4x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x1 + ble CGEMM_L4x1_SUB0 + bl CGEMM_4x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x1_SAVE + b CGEMM_L4x1_SUB2 + + +CGEMM_L4x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x1_32K + addi BO,BO,-32 + addi AO,AO,-8 + LOAD4x1O 8,32 + END4x1_WITHOUT_ADD + LOAD4x1_2O 16, 64 + mtctr T8 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + CMP4x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-16 + LOAD4x1_2O 16,64 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x1_SUB2_8 + bl CGEMM_4x1_L16_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x1_SUB2_4 + bl CGEMM_4x1_L8_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x1_SUB2_2 + LOAD4x1_2 + KERNEL4x1_L2 16,64, 0,0 + KERNEL4x1_E2 16,64, 1,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x1_SUB2_1 + LOAD4x1_2 + KERNEL4x1_E2 16,64, 0,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +CGEMM_L4x1_SAVE: +/*----------------------------------------*/ + + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4 +#endif + + +CGEMM_L4x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + bgt CGEMM_L4_BEGIN + + +CGEMM_L4_END: + +b CGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +CGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN +CGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 +CGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_L2 128,32,31,0 + KERNEL2x8_L2 128,32,32,0 + KERNEL2x8_L2 128,32,33,0 + KERNEL2x8_L2 128,32,34,0 + KERNEL2x8_L2 128,32,35,0 + KERNEL2x8_L2 128,32,36,0 + KERNEL2x8_L2 128,32,37,0 + KERNEL2x8_L2 128,32,38,0 + KERNEL2x8_L2 128,32,39,0 + KERNEL2x8_L2 128,32,40,0 + KERNEL2x8_L2 128,32,41,0 + KERNEL2x8_L2 128,32,42,0 + KERNEL2x8_L2 128,32,43,0 + KERNEL2x8_L2 128,32,44,0 + KERNEL2x8_L2 128,32,45,0 + KERNEL2x8_L2 128,32,46,0 + KERNEL2x8_L2 128,32,47,0 + KERNEL2x8_L2 128,32,48,0 + KERNEL2x8_L2 128,32,49,0 + KERNEL2x8_L2 128,32,50,0 + KERNEL2x8_L2 128,32,51,0 + KERNEL2x8_L2 128,32,52,0 + KERNEL2x8_L2 128,32,53,0 + KERNEL2x8_L2 128,32,54,0 + 
KERNEL2x8_L2 128,32,55,0 + KERNEL2x8_L2 128,32,56,0 + KERNEL2x8_L2 128,32,57,0 + KERNEL2x8_L2 128,32,58,0 + KERNEL2x8_L2 128,32,59,0 + KERNEL2x8_L2 128,32,60,0 + KERNEL2x8_L2 128,32,61,0 + KERNEL2x8_L2 128,32,62,0 + KERNEL2x8_L2 128,32,63,1 + bdnz CGEMM_L2x8_LOOP + MY_ALIGN +CGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + END2x8_2 + blr + MY_ALIGN + + +CGEMM_2x8_L64_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_E2 128,32,31,1 + blr + MY_ALIGN + + +CGEMM_2x8_L32_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_E2 128,32,15,1 + blr + MY_ALIGN + + +CGEMM_2x8_L16_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_E2 128,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +CGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,0,0 +CGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_L2 64,32,7,0 + KERNEL2x4_L2 64,32,8,0 + KERNEL2x4_L2 64,32,9,0 + KERNEL2x4_L2 64,32,10,0 + KERNEL2x4_L2 64,32,11,0 + KERNEL2x4_L2 64,32,12,0 + KERNEL2x4_L2 64,32,13,0 + KERNEL2x4_L2 64,32,14,0 + KERNEL2x4_L2 64,32,15,1 + bdnz CGEMM_L2x4_LOOP + MY_ALIGN +CGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + END2x4_2 + blr + MY_ALIGN + + +CGEMM_2x4_L16_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_E2 64,32,7,1 + blr + 
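/* For orientation: the CGEMM_* subroutines above unroll the inner K loop as a
   software-pipelined main loop (srawi derives the unroll count from K-2) plus
   progressively smaller power-of-two tail chunks selected with andi. (64/32/16/8/4/2/1),
   and the save paths scale the accumulated complex products by alpha in two fused
   multiply passes (MULT_APLHA_PART1/PART2). The scalar C sketch below is only an
   illustration of that arithmetic for the non-conjugated (NN) case; the packed
   data layout and the name cgemm_ref_tile are hypothetical, not taken from the kernel. */

   #include <stddef.h>

   /* One micro-tile of C[m x n] += alpha * A[m x k] * B[k x n], single-precision
      complex stored as interleaved (re, im) pairs in illustrative packed buffers. */
   static void cgemm_ref_tile(size_t m, size_t n, size_t k,
                              float alpha_r, float alpha_i,
                              const float *a, const float *b,
                              float *c, size_t ldc)
   {
       for (size_t j = 0; j < n; j++) {
           for (size_t i = 0; i < m; i++) {
               float acc_r = 0.0f, acc_i = 0.0f;
               /* the assembly walks this loop in 128x/32x unrolled blocks,
                  then finishes the remainder in 64/32/16/8/4/2/1 chunks */
               for (size_t l = 0; l < k; l++) {
                   float ar = a[2 * (l * m + i)], ai = a[2 * (l * m + i) + 1];
                   float br = b[2 * (l * n + j)], bi = b[2 * (l * n + j) + 1];
                   acc_r += ar * br - ai * bi;  /* conjugate variants flip these signs */
                   acc_i += ar * bi + ai * br;
               }
               /* alpha applied in two passes, analogous to MULT_APLHA_PART1/PART2:
                  first multiply by alpha_i, then fused multiply-sub/add by alpha_r */
               float t_r = acc_i * alpha_i;
               float t_i = acc_r * alpha_i;
               t_r = acc_r * alpha_r - t_r;
               t_i = acc_i * alpha_r + t_i;
               /* non-TRMM path accumulates into C, as in the SAVE macros */
               c[2 * (j * ldc + i)]     += t_r;
               c[2 * (j * ldc + i) + 1] += t_i;
           }
       }
   }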
MY_ALIGN + + +CGEMM_2x4_L8_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_E2 64,32,3,1 + blr + + +CGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +CGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,0,0 +CGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_L2 32,32,7,0 + KERNEL2x2_L2 32,32,8,0 + KERNEL2x2_L2 32,32,9,0 + KERNEL2x2_L2 32,32,10,0 + KERNEL2x2_L2 32,32,11,0 + KERNEL2x2_L2 32,32,12,0 + KERNEL2x2_L2 32,32,13,0 + KERNEL2x2_L2 32,32,14,0 + KERNEL2x2_L2 32,32,15,1 + bdnz CGEMM_L2x2_LOOP + MY_ALIGN + + +CGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +CGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_E2 32,32,7,1 + blr + MY_ALIGN +CGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_E2 32,32,3,1 + blr + + +CGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +CGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,0,0 +CGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_L2 16,32,7,0 + KERNEL2x1_L2 16,32,8,0 + KERNEL2x1_L2 16,32,9,0 + KERNEL2x1_L2 16,32,10,0 + KERNEL2x1_L2 16,32,11,0 + KERNEL2x1_L2 16,32,12,0 + KERNEL2x1_L2 16,32,13,0 + KERNEL2x1_L2 16,32,14,0 + KERNEL2x1_L2 16,32,15,1 + bdnz CGEMM_L2x1_LOOP + MY_ALIGN +CGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN +CGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_E2 16,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x1_L8_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_E2 16,32,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L2: +/*----------------------------------------*/ + + andi. J, N, 2 + ble CGEMM_L2_END + + +CGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L2x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. 
T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO2x8 + ble CGEMM_L2x8_SUB0 + bl CGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L2x8_SAVE + b CGEMM_L2x8_SUB2 + + +CGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD2x8O 64,16 + END2x8_WITHOUT_ADD + LOAD2x8_2O 128, 32 + mtctr T8 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8_2O 128,32 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + MY_ALIGN + + +CGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L2x8_SUB2_32 + bl CGEMM_2x8_L64_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L2x8_SUB2_16 + bl CGEMM_2x8_L32_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x8_SUB2_8 + bl CGEMM_2x8_L16_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x8_SUB2_4 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_L2 128,32, 1,0 + KERNEL2x8_L2 128,32, 2,0 + KERNEL2x8_E2 128,32, 3,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x8_SUB2_2 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_E2 128,32, 1,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x8_SUB2_1 + LOAD2x8_2 + KERNEL2x8_E2 128,32, 0,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +CGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt CGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END + b CGEMM_L2x4_BEGIN + MY_ALIGN + + +CGEMM_L2x8_END: +/*----------------------------------------*/ + + +CGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x4 + ble CGEMM_L2x4_SUB0 + bl CGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x4_SAVE + b CGEMM_L2x4_SUB2 + + +CGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD2x4O 32,16 + END2x4_WITHOUT_ADD + LOAD2x4_2O 64, 32 + mtctr T8 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4_2O 64,32 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x4_SUB2_8 + bl CGEMM_2x4_L16_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x4_SUB2_4 + bl CGEMM_2x4_L8_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x4_SUB2_2 + LOAD2x4_2 + KERNEL2x4_L2 64,32, 0,0 + KERNEL2x4_E2 64,32, 1,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x4_SUB2_1 + LOAD2x4_2 + KERNEL2x4_E2 64,32, 0,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x4_SAVE + KERNEL2x4 + + +CGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif + + +CGEMM_L2x4_END: +/*----------------------------------------*/ + + +CGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x2 + ble CGEMM_L2x2_SUB0 + bl CGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x2_SAVE + b CGEMM_L2x2_SUB2 + + +CGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-16 + addi AO,AO,-16 + LOAD2x2O 16,16 + END2x2_WITHOUT_ADD + LOAD2x2_2O 32, 32 + mtctr T8 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2_2O 32,32 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x2_SUB2_8 + bl CGEMM_2x2_L16_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x2_SUB2_4 + bl CGEMM_2x2_L8_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x2_SUB2_2 + LOAD2x2_2 + KERNEL2x2_L2 32,32, 0,0 + KERNEL2x2_E2 32,32, 1,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x2_SUB2_1 + LOAD2x2_2 + KERNEL2x2_E2 32,32, 0,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +CGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif + + +CGEMM_L2x2_END: +/*----------------------------------------*/ + + +CGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble CGEMM_L2x1_SUB0 + bl CGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x1_SAVE + b CGEMM_L2x1_SUB2 + + +CGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-16 + addi AO,AO,-8 + LOAD2x1O 8,16 + END2x1_WITHOUT_ADD + LOAD2x1_2O 16, 32 + mtctr T8 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1_2O 16,32 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x1_SUB2_8 + bl CGEMM_2x1_L16_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x1_SUB2_4 + bl CGEMM_2x1_L8_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 16,32, 0,0 + KERNEL2x1_E2 16,32, 1,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 16,32, 0,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble CGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +CGEMM_L2x1_SAVE: +/*----------------------------------------*/ + + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif + + +CGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 4 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + +CGEMM_L2_END: + + +b CGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +CGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +CGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 +CGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_L2 128,16,31,0 + KERNEL1x8_L2 128,16,32,0 + KERNEL1x8_L2 128,16,33,0 + KERNEL1x8_L2 128,16,34,0 + KERNEL1x8_L2 128,16,35,0 + KERNEL1x8_L2 128,16,36,0 + KERNEL1x8_L2 128,16,37,0 + KERNEL1x8_L2 128,16,38,0 + KERNEL1x8_L2 128,16,39,0 + KERNEL1x8_L2 128,16,40,0 + KERNEL1x8_L2 128,16,41,0 + KERNEL1x8_L2 128,16,42,0 + KERNEL1x8_L2 128,16,43,0 + KERNEL1x8_L2 128,16,44,0 + KERNEL1x8_L2 128,16,45,0 + KERNEL1x8_L2 128,16,46,0 + KERNEL1x8_L2 128,16,47,0 + KERNEL1x8_L2 128,16,48,0 + KERNEL1x8_L2 128,16,49,0 + KERNEL1x8_L2 128,16,50,0 + KERNEL1x8_L2 128,16,51,0 + KERNEL1x8_L2 128,16,52,0 + KERNEL1x8_L2 128,16,53,0 + KERNEL1x8_L2 128,16,54,0 + KERNEL1x8_L2 128,16,55,0 + KERNEL1x8_L2 128,16,56,0 + KERNEL1x8_L2 128,16,57,0 + KERNEL1x8_L2 128,16,58,0 + KERNEL1x8_L2 128,16,59,0 + KERNEL1x8_L2 128,16,60,0 + KERNEL1x8_L2 128,16,61,0 + KERNEL1x8_L2 128,16,62,0 + KERNEL1x8_L2 128,16,63,1 + bdnz CGEMM_L1x8_LOOP + MY_ALIGN +CGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + + +CGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 
128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_E2 128,16,31,1 + blr + MY_ALIGN + + +CGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_E2 128,16,15,1 + blr + MY_ALIGN + + +CGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_E2 128,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN +CGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,0,0 +CGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_L2 64,16,7,0 + KERNEL1x4_L2 64,16,8,0 + KERNEL1x4_L2 64,16,9,0 + KERNEL1x4_L2 64,16,10,0 + KERNEL1x4_L2 64,16,11,0 + KERNEL1x4_L2 64,16,12,0 + KERNEL1x4_L2 64,16,13,0 + KERNEL1x4_L2 64,16,14,0 + KERNEL1x4_L2 64,16,15,1 + bdnz CGEMM_L1x4_LOOP + MY_ALIGN +CGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +CGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_E2 64,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_E2 64,16,3,1 + blr + + +CGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN +CGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,0,0 +CGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + KERNEL1x2_L2 32,16,7,0 + KERNEL1x2_L2 32,16,8,0 + KERNEL1x2_L2 32,16,9,0 + KERNEL1x2_L2 32,16,10,0 + KERNEL1x2_L2 32,16,11,0 + KERNEL1x2_L2 32,16,12,0 + KERNEL1x2_L2 32,16,13,0 + KERNEL1x2_L2 32,16,14,0 + KERNEL1x2_L2 32,16,15,1 + bdnz CGEMM_L1x2_LOOP + MY_ALIGN + + +CGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN +CGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + 
KERNEL1x2_E2 32,16,7,1 + blr + MY_ALIGN +CGEMM_1x2_L8_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_E2 32,16,3,1 + blr + + +CGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN +CGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,0,0 +CGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_L2 16,16,7,0 + KERNEL1x1_L2 16,16,8,0 + KERNEL1x1_L2 16,16,9,0 + KERNEL1x1_L2 16,16,10,0 + KERNEL1x1_L2 16,16,11,0 + KERNEL1x1_L2 16,16,12,0 + KERNEL1x1_L2 16,16,13,0 + KERNEL1x1_L2 16,16,14,0 + KERNEL1x1_L2 16,16,15,1 + bdnz CGEMM_L1x1_LOOP + MY_ALIGN +CGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + + MY_ALIGN +CGEMM_1x1_L16_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_E2 16,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x1_L8_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_E2 16,16,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L1: +/*----------------------------------------*/ + + andi. J, N, 1 + ble CGEMM_L1_END + +CGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble CGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO1x8 + ble CGEMM_L1x8_SUB0 + bl CGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L1x8_SAVE + b CGEMM_L1x8_SUB2 + + +CGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-8 + addi AO,AO,-64 + LOAD1x8O 64,8 + END1x8_WITHOUT_ADD + LOAD1x8_2O 128, 16 + mtctr T8 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8_2O 128,16 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + MY_ALIGN + + +CGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L1x8_SUB2_32 + bl CGEMM_1x8_L64_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. 
T1,L, 32 + ble CGEMM_L1x8_SUB2_16 + bl CGEMM_1x8_L32_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x8_SUB2_8 + bl CGEMM_1x8_L16_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x8_SUB2_4 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_L2 128,16, 1,0 + KERNEL1x8_L2 128,16, 2,0 + KERNEL1x8_E2 128,16, 3,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x8_SUB2_2 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_E2 128,16, 1,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x8_SUB2_1 + LOAD1x8_2 + KERNEL1x8_E2 128,16, 0,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x8_SAVE + KERNEL1x8 + + MY_ALIGN +CGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt CGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END + b CGEMM_L1x4_BEGIN + MY_ALIGN + + +CGEMM_L1x8_END: +/*----------------------------------------*/ + + +CGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x4 + ble CGEMM_L1x4_SUB0 + bl CGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x4_SAVE + b CGEMM_L1x4_SUB2 + + +CGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-8 + addi AO,AO,-32 + LOAD1x4O 32,8 + END1x4_WITHOUT_ADD + LOAD1x4_2O 64, 16 + mtctr T8 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4_2O 64,16 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x4_SUB2_8 + bl CGEMM_1x4_L16_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x4_SUB2_4 + bl CGEMM_1x4_L8_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x4_SUB2_2 + LOAD1x4_2 + KERNEL1x4_L2 64,16, 0,0 + KERNEL1x4_E2 64,16, 1,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x4_SUB2_1 + LOAD1x4_2 + KERNEL1x4_E2 64,16, 0,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x4_SAVE + KERNEL1x4 + + +CGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif + + +CGEMM_L1x4_END: +/*----------------------------------------*/ + + +CGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. 
T1, M, 2 + ble CGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x2 + ble CGEMM_L1x2_SUB0 + bl CGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x2_SAVE + b CGEMM_L1x2_SUB2 + + +CGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-8 + addi AO,AO,-16 + LOAD1x2O 16,8 + END1x2_WITHOUT_ADD + LOAD1x2_2O 32, 16 + mtctr T8 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2_2O 32,16 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x2_SUB2_8 + bl CGEMM_1x2_L16_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x2_SUB2_4 + bl CGEMM_1x2_L8_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x2_SUB2_2 + LOAD1x2_2 + KERNEL1x2_L2 32,16, 0,0 + KERNEL1x2_E2 32,16, 1,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x2_SUB2_1 + LOAD1x2_2 + KERNEL1x2_E2 32,16, 0,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x2_SAVE + KERNEL1x2 + + MY_ALIGN +CGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif + + +CGEMM_L1x2_END: +/*----------------------------------------*/ + + +CGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 31x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 31x */ +#endif + ZERO1x1 + ble CGEMM_L1x1_SUB0 + bl CGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x1_SAVE + b CGEMM_L1x1_SUB2 + + +CGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x1_32K + addi BO,BO,-8 + addi AO,AO,-8 + LOAD1x1O 8,8 + END1x1_WITHOUT_ADD + LOAD1x1_2O 16, 16 + mtctr T8 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-16 + LOAD1x1_2O 16,16 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x1_SUB2_8 + bl CGEMM_1x1_L16_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble CGEMM_L1x1_SUB2_4 + bl CGEMM_1x1_L8_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 16,16, 0,0 + KERNEL1x1_E2 16,16, 1,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 16,16, 0,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x1_SAVE + KERNEL1x1 + + MY_ALIGN +CGEMM_L1x1_SAVE: +/*----------------------------------------*/ + + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 +#endif + + +CGEMM_L1x1_END: +/*----------------------------------------*/ + slwi T1, K, 3 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + +CGEMM_L1_END: + + + + diff --git a/kernel/power/cgemm_macros_power9.S b/kernel/power/cgemm_macros_power9.S index a256e1a01..be2b74f01 100644 --- a/kernel/power/cgemm_macros_power9.S +++ b/kernel/power/cgemm_macros_power9.S @@ -1,3019 +1,3019 @@ - -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@gmail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ -#define unit_size 8 -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) -#define DISPX(disp) (disp) - -.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2 -#else // CC || CR || RC || RR - /*we will assume {-alpha_r,-alpha_i} for this case */ - /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ - xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1 - /*we will negate alpha image instead to fix sign*/ - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#endif -.endm - - -.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1,VSINR,VSINI_OUT2,VSINI -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2 -#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#else // CC || CR || RC || RR - /*we will assume {-alpha_r,-alpha_i} for this case */ - /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ - xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1 - /*we will negate alpha image instead to fix sign*/ - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#endif -.endm - -/* {i0,i1} * {alpha_i,alpha_i} [- VSOUT1] ;[VSOUT2 +] {r0,r1}*{alpha_i,alpha_i} */ - -.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 - xvmulsp \VSOUT1,\VSINII, alpha_i - xvmulsp \VSOUT2,\VSINRR, alpha_i -.endm - -/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ - -.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 - xvmsubasp \VSOUT1,\VSINRR, alpha_r - xvmaddasp \VSOUT2,\VSINII, alpha_r -.endm - -/* macros for N=4 and M=8 -**********************************************************************************************/ - -.macro Zero4x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, 
vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endm - - -.macro LOAD4x8 - LOAD4x8O 0,0 -.endm - - -.macro LOAD4x8O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - lxv vs28, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x8_NORMAL - END4x8 AO,BO,64,32 -.endm - - -.macro END4x8_WITHOUT_ADD - END4x8 AO,BO,0,0 -.endm - - -.macro END4x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 -.endm - - -.macro LOAD4x8_2 - LOAD4x8_2O 0,0 -.endm - - -.macro LOAD4x8_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs12, (16+\OffsetB)(BO) - lxv vs24, (32+\OffsetB)(BO) - lxv vs28, (32+16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - lxv vs6, (32+\OffsetA)(AO) - lxv vs7, (48+\OffsetA)(AO) - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - lxv vs0, (64+\OffsetA)(AO) - lxv vs1, (64+16+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - lxv vs2, (64+32+\OffsetA)(AO) - lxv vs3, (64+48+\OffsetA)(AO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x8_2 - /*for load2 offset will be 128 and 64*/ - KERNEL4x8_2 AO,BO, 128,64,0 ,1,1 -.endm - - -.macro KERNEL4x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 -.if \Complete==0 - lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) - lxv vs5, 
DISP16(\Index,16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, vs7,vs12 -.if \Complete==0 - lxv vs8, DISP8(\Index,\OffsetB)(\BREG) - lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask -.endif - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 -.if \Complete==0 - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.endif - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - xvmaddasp vs62, vs6,vs15 - xvmaddasp vs63, vs7,vs15 -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 -.endif -.if \Complete==0 - lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 -.if \Complete==0 - lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endif - -.if \Complete==0 - lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,\OffsetB) - addi \AREG, \AREG, DISP16(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP8(\Index,64) - addi \AREG, \AREG, DISP16(\Index,128) -.endif - -.endif -.endm - - -.macro KERNEL4x8 - LOAD4x8 - END4x8 AO, BO, 64,32 -.endm - - -.macro SAVE4x8 - add T4, LDC,LDC - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask -#ifndef TRMMKERNEL - lxv vs26 , 32(CO) - lxv vs27 , 48(CO) -#endif - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask -#ifndef TRMMKERNEL - lxv vs28 , 0(T1) - lxv vs29 , 16(T1) -#endif - xxperm vs2,vs34,permute_mask - xxperm vs6,vs42,permute_mask -#ifndef TRMMKERNEL - lxv vs30 , 32(T1) - lxv vs31 , 48(T1) -#endif - xxperm vs3,vs35,permute_mask - xxperm vs7,vs43,permute_mask - add T2,CO,T4 - add T3,T1,T4 - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - xxperm vs9,vs37,permute_mask - xxperm 
vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 - xxperm vs10,vs38,permute_mask - xxperm vs14,vs46,permute_mask - AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 - xxperm vs11,vs39,permute_mask - xxperm vs15,vs47,permute_mask - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - xxperm vs0,vs48,permute_mask - xxperm vs4,vs56,permute_mask - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - xxperm vs1,vs49,permute_mask - xxperm vs5,vs57,permute_mask - AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 - xxperm vs2,vs50,permute_mask - xxperm vs6,vs58,permute_mask - AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 - xxperm vs3,vs51,permute_mask - xxperm vs7,vs59,permute_mask - AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 - xxperm vs8,vs52,permute_mask - xxperm vs12,vs60,permute_mask - AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 - xxperm vs9,vs53,permute_mask - xxperm vs13,vs61,permute_mask - AGGREGATE_REALS_IMAGES vs50,vs2,vs58,vs6 - xxperm vs10,vs54,permute_mask - xxperm vs14,vs62,permute_mask - AGGREGATE_REALS_IMAGES vs51,vs3,vs59,vs7 - xxperm vs11,vs55,permute_mask - xxperm vs15,vs63,permute_mask - AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 - AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - AGGREGATE_REALS_IMAGES vs54,vs10,vs62,vs14 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - AGGREGATE_REALS_IMAGES vs55,vs11,vs63,vs15 - MULT_APLHA_PART1 vs34,vs42,vs4,vs5 - MULT_APLHA_PART1 vs35,vs43,vs6,vs7 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs34,vs42,vs4,vs5 - MULT_APLHA_PART2 vs35,vs43,vs6,vs7 - #ifndef TRMMKERNEL - lxv vs32 , 0(T2) - lxv vs40 , 16(T2) -#endif - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 -#ifndef TRMMKERNEL - lxv vs33 , 32(T2) - lxv vs41 , 48(T2) -#endif - MULT_APLHA_PART1 vs38,vs46,vs12,vs13 - MULT_APLHA_PART1 vs39,vs47,vs14,vs15 -#ifndef TRMMKERNEL - lxv vs34 , 0(T3) - lxv vs42 , 16(T3) -#endif - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 -#ifndef TRMMKERNEL - lxv vs35 , 32(T3) - lxv vs43 , 48(T3) -#endif - MULT_APLHA_PART2 vs38,vs46,vs12,vs13 - MULT_APLHA_PART2 vs39,vs47,vs14,vs15 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs26,vs26,vs5 - xvaddsp vs27,vs27,vs7 - xvaddsp vs28,vs28,vs9 - xvaddsp vs29,vs29,vs11 - xvaddsp vs30,vs30,vs13 - xvaddsp vs31,vs31,vs15 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs12,vs4,2 - xxpermdi vs27,vs14,vs6,2 - xxpermdi vs28,vs0,vs8,2 - xxpermdi vs29,vs2,vs10,2 - xxpermdi vs30,vs4,vs12,2 - xxpermdi vs31,vs6,vs14,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - MULT_APLHA_PART1 vs48,vs56,vs0,vs1 - MULT_APLHA_PART1 vs49,vs57,vs2,vs3 - stxv vs26 , 32(CO) - stxv vs27 , 48(CO) - MULT_APLHA_PART1 vs50,vs58,vs4,vs5 - MULT_APLHA_PART1 vs51,vs59,vs6,vs7 - stxv vs28 , 0(T1) - stxv vs29 , 16(T1) - MULT_APLHA_PART2 vs48,vs56,vs0,vs1 - MULT_APLHA_PART2 vs49,vs57,vs2,vs3 - stxv vs30 , 32(T1) - stxv vs31 , 48(T1) - MULT_APLHA_PART2 
vs50,vs58,vs4,vs5 - MULT_APLHA_PART2 vs51,vs59,vs6,vs7 - MULT_APLHA_PART1 vs52,vs60,vs8,vs9 - MULT_APLHA_PART1 vs53,vs61,vs10,vs11 - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - MULT_APLHA_PART1 vs54,vs62,vs12,vs13 - MULT_APLHA_PART1 vs55,vs63,vs14,vs15 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - MULT_APLHA_PART2 vs52,vs60,vs8,vs9 - MULT_APLHA_PART2 vs53,vs61,vs10,vs11 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - MULT_APLHA_PART2 vs54,vs62,vs12,vs13 - MULT_APLHA_PART2 vs55,vs63,vs14,vs15 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs32,vs32,vs1 - xvaddsp vs40,vs40,vs3 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs33,vs33,vs5 - xvaddsp vs41,vs41,vs7 - xvaddsp vs34,vs34,vs9 - xvaddsp vs42,vs42,vs11 - xvaddsp vs35,vs35,vs13 - xvaddsp vs43,vs43,vs15 -#else - xxpermdi vs32,vs8,vs0,2 - xxpermdi vs40,vs10,vs2,2 - xxpermdi vs33,vs12,vs4,2 - xxpermdi vs41,vs14,vs6,2 - xxpermdi vs34,vs0,vs8,2 - xxpermdi vs42,vs2,vs10,2 - xxpermdi vs35,vs4,vs12,2 - xxpermdi vs43,vs6,vs14,2 -#endif - stxv vs32 , 0(T2) - stxv vs40 , 16(T2) - stxv vs33 , 32(T2) - stxv vs41 , 48(T2) - stxv vs34 , 0(T3) - stxv vs42 , 16(T3) - stxv vs35 , 32(T3) - stxv vs43 , 48(T3) - addi CO, CO, 64 -.endm - -/* macros for N=4 and M=4 -**********************************************************************************************/ - -.macro Zero4x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 -.endm - - -.macro LOAD4x4 - LOAD4x4O 0,0 -.endm - - -.macro LOAD4x4O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - lxv vs28, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x4_NORMAL - END4x4 AO,BO,32,32 -.endm - - -.macro END4x4_WITHOUT_ADD - END4x4 AO,BO,0,0 -.endm - - -.macro END4x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.endm - - -.macro LOAD4x4_2 - LOAD4x4_2O 0,0 -.endm - - -.macro LOAD4x4_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs12, (16+\OffsetB)(BO) - lxv vs24, (32+\OffsetB)(BO) - lxv vs28, (32+16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - lxv 
vs0, (32+\OffsetA)(AO) - lxv vs1, (32+16+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x4_2 - /*for load2 offset will be 64 and 64*/ - KERNEL4x4_2 AO,BO, 64,64,0 ,1,1 -.endm - - -.macro KERNEL4x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 -.if \Complete==0 - lxv vs8, DISP8(\Index,\OffsetB)(\BREG) - lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask -.endif - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 -.if \Complete==0 - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.endif -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 -.if \Complete==0 - lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,\OffsetB) - addi \AREG, \AREG, DISP8(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP8(\Index,64) - addi \AREG, \AREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL4x4 - LOAD4x4 - END4x4 AO, BO, 32,32 -.endm - - -.macro SAVE4x4 - add T4, LDC,LDC - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - add T2,CO,T4 - add T3,T1,T4 -#ifndef TRMMKERNEL - lxv vs26 , 0(T1) - lxv vs27 , 16(T1) -#endif - #ifndef TRMMKERNEL - lxv vs28 , 0(T2) - lxv vs29 , 16(T2) -#endif -#ifndef TRMMKERNEL - lxv vs30 , 0(T3) - lxv vs31 , 16(T3) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES 
vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - xxperm vs0,vs48,permute_mask - xxperm vs4,vs56,permute_mask - xxperm vs1,vs49,permute_mask - xxperm vs5,vs57,permute_mask - xxperm vs8,vs52,permute_mask - xxperm vs12,vs60,permute_mask - xxperm vs9,vs53,permute_mask - xxperm vs13,vs61,permute_mask - AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 - AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 - AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 - AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART1 vs48,vs56,vs4,vs5 - MULT_APLHA_PART1 vs49,vs57,vs6,vs7 - MULT_APLHA_PART1 vs52,vs60,vs12,vs13 - MULT_APLHA_PART1 vs53,vs61,vs14,vs15 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs48,vs56,vs4,vs5 - MULT_APLHA_PART2 vs49,vs57,vs6,vs7 - MULT_APLHA_PART2 vs52,vs60,vs12,vs13 - MULT_APLHA_PART2 vs53,vs61,vs14,vs15 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xvaddsp vs26,vs26,vs9 - xvaddsp vs27,vs27,vs11 - xvaddsp vs28,vs28,vs5 - xvaddsp vs29,vs29,vs7 - xvaddsp vs30,vs30,vs13 - xvaddsp vs31,vs31,vs15 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs0,vs8,2 - xxpermdi vs27,vs2,vs10,2 - xxpermdi vs28,vs12,vs4,2 - xxpermdi vs29,vs14,vs6,2 - xxpermdi vs30,vs4,vs12,2 - xxpermdi vs31,vs6,vs14,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 0(T1) - stxv vs27 , 16(T1) - stxv vs28 , 0(T2) - stxv vs29 , 16(T2) - stxv vs30 , 0(T3) - stxv vs31 , 16(T3) - addi CO, CO, 32 -.endm - -/* macros for N=4 and M=2 -**********************************************************************************************/ - -.macro Zero4x2 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 -.endm - - -.macro LOAD4x2 - LOAD4x2O 0,0 -.endm - - -.macro LOAD4x2O OffsetA,OffsetB - lxv vs24, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - lxv vs1, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END4x2_NORMAL - END4x2 AO,BO,16,32 -.endm - - -.macro END4x2_WITHOUT_ADD - END4x2 AO,BO,0,0 -.endm - - -.macro END4x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.endm - - -.macro LOAD4x2_2 - LOAD4x2_2O 0,0 -.endm - - -.macro LOAD4x2_2O OffsetA,OffsetB - lxv vs8, 
(\OffsetA)(AO) - lxv vs24, (16+\OffsetA)(AO) - lxv vs4, (0+\OffsetB)(BO) - lxv vs5, (16+\OffsetB)(BO) - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - lxv vs0, (32+\OffsetB)(BO) - lxv vs1, (32+16+\OffsetB)(BO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END4x2_2 - /*for load2 offset will be 32 and 64*/ - KERNEL4x2_2 AO,BO, 32,64,0 ,1,1 -.endm - - -.macro KERNEL4x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) - lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP8(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,32) - addi \BREG, \BREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL4x2 - LOAD4x2 - END4x2 AO, BO, 16,32 -.endm - - -.macro SAVE4x2 - add T4, LDC,LDC - add T1, CO ,LDC - add T2,CO,T4 - add T3,T1,T4 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxv vs25 , 0(T1) -#endif -#ifndef TRMMKERNEL - lxv vs26 , 0(T2) -#endif -#ifndef TRMMKERNEL - lxv vs27 , 0(T3) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs37,vs9,vs45,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,0 - xxpermdi vs9,vs10,vs2,0 - xxpermdi vs3,vs0,vs8,3 - xxpermdi vs11,vs2,vs10,3 - xvaddsp vs24,vs24,vs1 - 
xvaddsp vs26,vs26,vs9 - xvaddsp vs25,vs25,vs3 - xvaddsp vs27,vs27,vs11 -#else - xxpermdi vs24,vs8,vs0,0 - xxpermdi vs26,vs10,vs2,0 - xxpermdi vs25,vs0,vs8,3 - xxpermdi vs27,vs2,vs10,3 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 0(T1) - stxv vs26 , 0(T2) - stxv vs27 , 0(T3) - addi CO, CO, 16 -.endm - -/* macros for N=4 and M=2 -**********************************************************************************************/ - -.macro Zero4x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 -.endm - - -.macro LOAD4x1 - LOAD4x1O 0,0 -.endm - - -.macro LOAD4x1O OffsetA,OffsetB - lxsd v4, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - lxv vs1, (\OffsetB+16)(BO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END4x1_NORMAL - END4x1 AO,BO,8,32 -.endm - - -.macro END4x1_WITHOUT_ADD - END4x1 AO,BO,0,0 -.endm - - -.macro END4x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.endm - - -.macro LOAD4x1_2 - LOAD4x1_2O 0,0 -.endm - - -.macro LOAD4x1_2O OffsetA,OffsetB - lxv vs27, (\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - lxv vs4, (0+\OffsetB)(BO) - lxv vs5, (16+\OffsetB)(BO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask - lxv vs0, (32+\OffsetB)(BO) - lxv vs1, (32+16+\OffsetB)(BO) -.endm - - -.macro END4x1_2 - /*for load2 offset will be 16 and 64*/ - KERNEL4x1_2 AO,BO, 16,64,0 ,1,1 -.endm - - -.macro KERNEL4x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetA)(\AREG) - xxspltd vs8,vs27,1 -.endif -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) - lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP8(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,16) - addi \BREG, \BREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL4x1 - LOAD4x1 - END4x1 AO, BO, 8,32 -.endm - - -.macro SAVE4x1 - add T4, LDC,LDC - add T1, CO ,LDC - add T2,CO,T4 - add T3,T1,T4 -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxsd v5 , 0(T1) -#endif -#ifndef TRMMKERNEL - lxsd v6 , 0(T2) -#endif -#ifndef TRMMKERNEL - lxsd v7 , 0(T3) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - 
MULT_APLHA_PART2 vs33,vs41,vs2,vs3 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxspltd vs1,vs0,0 - xxspltd vs3,vs0,1 - xxspltd vs9,vs2,0 - xxspltd vs11,vs2,1 - /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ - xvaddsp vs36,vs36,vs1 - xvaddsp vs37,vs37,vs3 - xvaddsp vs38,vs38,vs9 - xvaddsp vs39,vs39,vs11 -#else - /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ - xxspltd vs36,vs0,0 - xxspltd vs37,vs0,1 - xxspltd vs38,vs2,0 - xxspltd vs39,vs2,1 -#endif - stxsd v4 , 0(CO) - stxsd v5 , 0(T1) - stxsd v6 , 0(T2) - stxsd v7 , 0(T3) - addi CO, CO, 8 -.endm - -/* macros for N=2 and M=8 -**********************************************************************************************/ - -.macro Zero2x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 -.endm - - -.macro LOAD2x8 - LOAD2x8O 0,0 -.endm - - -.macro LOAD2x8O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - xxperm vs26, vs24, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x8_NORMAL - END2x8 AO,BO,64,16 -.endm - - -.macro END2x8_WITHOUT_ADD - END2x8 AO,BO,0,0 -.endm - - -.macro END2x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 -.endm - - -.macro LOAD2x8_2 - LOAD2x8_2O 0,0 -.endm - - -.macro LOAD2x8_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs24, (16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask - lxv vs6, (32+\OffsetA)(AO) - lxv vs7, (48+\OffsetA)(AO) - lxv vs0, (64+\OffsetA)(AO) - lxv vs1, (64+16+\OffsetA)(AO) - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs25, vs24, vs24,2 - lxv vs2, (64+32+\OffsetA)(AO) - lxv vs3, (64+48+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x8_2 - /*for load2 offset will be 128 and 32*/ - KERNEL2x8_2 AO,BO, 128,32,0 ,1,1 -.endm - - -.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 -.if \Complete==0 - lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) 
-.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif -.if \Complete==0 - lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \Complete==0 - lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP4(\Index,\OffsetB) - addi \AREG, \AREG, DISP16(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP4(\Index,32) - addi \AREG, \AREG, DISP16(\Index,128) -.endif - -.endif -.endm - - -.macro KERNEL2x8 - LOAD2x8 - END2x8 AO, BO, 64,16 -.endm - - -.macro SAVE2x8 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask -#ifndef TRMMKERNEL - lxv vs26 , 32(CO) - lxv vs27 , 48(CO) -#endif - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask -#ifndef TRMMKERNEL - lxv vs28 , 0(T1) - lxv vs29 , 16(T1) -#endif - xxperm vs2,vs34,permute_mask - xxperm vs6,vs42,permute_mask -#ifndef TRMMKERNEL - lxv vs30 , 32(T1) - lxv vs31 , 48(T1) -#endif - xxperm vs3,vs35,permute_mask - xxperm vs7,vs43,permute_mask - add T2,CO,T4 - add T3,T1,T4 - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 - xxperm vs10,vs38,permute_mask - xxperm vs14,vs46,permute_mask - AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 - xxperm vs11,vs39,permute_mask - xxperm vs15,vs47,permute_mask - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 - AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs34,vs42,vs4,vs5 - MULT_APLHA_PART1 vs35,vs43,vs6,vs7 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs34,vs42,vs4,vs5 - MULT_APLHA_PART2 vs35,vs43,vs6,vs7 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART1 vs38,vs46,vs12,vs13 - MULT_APLHA_PART1 vs39,vs47,vs14,vs15 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs38,vs46,vs12,vs13 - MULT_APLHA_PART2 
vs39,vs47,vs14,vs15 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs26,vs26,vs5 - xvaddsp vs27,vs27,vs7 - xvaddsp vs28,vs28,vs9 - xvaddsp vs29,vs29,vs11 - xvaddsp vs30,vs30,vs13 - xvaddsp vs31,vs31,vs15 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs12,vs4,2 - xxpermdi vs27,vs14,vs6,2 - xxpermdi vs28,vs0,vs8,2 - xxpermdi vs29,vs2,vs10,2 - xxpermdi vs30,vs4,vs12,2 - xxpermdi vs31,vs6,vs14,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 32(CO) - stxv vs27 , 48(CO) - stxv vs28 , 0(T1) - stxv vs29 , 16(T1) - stxv vs30 , 32(T1) - stxv vs31 , 48(T1) - addi CO, CO, 64 -.endm - -/* macros for N=2 and M=4 -**********************************************************************************************/ - -.macro Zero2x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 -.endm - - -.macro LOAD2x4 - LOAD2x4O 0,0 -.endm - - -.macro LOAD2x4O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x4_NORMAL - END2x4 AO,BO,32,16 -.endm - - -.macro END2x4_WITHOUT_ADD - END2x4 AO,BO,0,0 -.endm - - -.macro END2x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.endm - - -.macro LOAD2x4_2 - LOAD2x4_2O 0,0 -.endm - - -.macro LOAD2x4_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs24, (16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs25, vs24, vs24,2 - lxv vs0, (32+\OffsetA)(AO) - lxv vs1, (32+16+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x4_2 - /*for load2 offset will be 64 and 32*/ - KERNEL2x4_2 AO,BO, 64,32,0 ,1,1 -.endm - - -.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif -.if \Complete==0 - 
lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP4(\Index,\OffsetB) - addi \AREG, \AREG, DISP8(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP4(\Index,32) - addi \AREG, \AREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL2x4 - LOAD2x4 - END2x4 AO, BO, 32,16 -.endm - - -.macro SAVE2x4 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif -#ifndef TRMMKERNEL - lxv vs26 , 0(T1) - lxv vs27 , 16(T1) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xvaddsp vs26,vs26,vs9 - xvaddsp vs27,vs27,vs11 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs0,vs8,2 - xxpermdi vs27,vs2,vs10,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 0(T1) - stxv vs27 , 16(T1) - addi CO, CO, 32 -.endm - -/* macros for N=2 and M=2 -**********************************************************************************************/ - -.macro Zero2x2 - xxlxor vs32, vs32, vs32 - xxlxor vs36, vs36, vs36 - xxlxor vs40, vs40, vs40 - xxlxor vs44, vs44, vs44 -.endm - - -.macro LOAD2x2 - LOAD2x2O 0,0 -.endm - - -.macro LOAD2x2O OffsetA,OffsetB - lxv vs24, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x2_NORMAL - END2x2 AO,BO,16,16 -.endm - - -.macro END2x2_WITHOUT_ADD - END2x2 AO,BO,0,0 -.endm - - -.macro END2x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs44, vs0,vs27 -.endm - - -.macro LOAD2x2_2 - LOAD2x2_2O 0,0 -.endm - - -.macro LOAD2x2_2O OffsetA,OffsetB - lxv vs8, (\OffsetA)(AO) - lxv 
vs24, (16+\OffsetA)(AO) - lxv vs4, (0+\OffsetB)(BO) - lxv vs0, (16+\OffsetB)(BO) - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x2_2 - /*for load2 offset will be 32 and 32*/ - KERNEL2x2_2 AO,BO, 32,32,0 ,1,1 -.endm - - -.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs44, vs4,vs11 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif -.if \Complete==0 - lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs44, vs0,vs27 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \Complete==0 - lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,32) - addi \BREG, \BREG, DISP4(\Index,32) -.endif - -.endif -.endm - - -.macro KERNEL2x2 - LOAD2x2 - END2x2 AO, BO, 16,16 -.endm - - -.macro SAVE2x2 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxv vs26 , 0(T1) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs8,vs9, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,0 - xxpermdi vs9,vs0,vs8,3 - xvaddsp vs24,vs24,vs1 - xvaddsp vs26,vs26,vs9 -#else - xxpermdi vs24,vs8,vs0,0 - xxpermdi vs26,vs0,vs8,3 -#endif - stxv vs24 , 0(CO) - stxv vs26 , 0(T1) - addi CO, CO, 16 -.endm - -/* macros for N=2 and M=1 -**********************************************************************************************/ - -.macro Zero2x1 - xxlxor vs32, vs32, vs32 - xxlxor vs40, vs40, vs40 -.endm - - -.macro LOAD2x1 - LOAD2x1O 0,0 -.endm - - -.macro LOAD2x1O OffsetA,OffsetB - lxsd v4, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END2x1_NORMAL - END2x1 AO,BO,8,16 -.endm - - -.macro END2x1_WITHOUT_ADD - END2x1 AO,BO,0,0 -.endm - - -.macro END2x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.endm - - -.macro LOAD2x1_2 - LOAD2x1_2O 0,0 -.endm - - -.macro LOAD2x1_2O OffsetA,OffsetB - lxv vs27, (\OffsetA)(AO) - lxv vs4, (0+\OffsetB)(BO) - lxv 
vs0, (16+\OffsetB)(BO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END2x1_2 - /*for load2 offset will be 16 and 32*/ - KERNEL2x1_2 AO,BO, 16,32,0 ,1,1 -.endm - - -.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetA)(\AREG) - xxspltd vs8,vs27,1 -.endif -.if \Complete==0 - lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \Complete==0 - lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,16) - addi \BREG, \BREG, DISP4(\Index,32) -.endif - -.endif -.endm - - -.macro KERNEL2x1 - LOAD2x1 - END2x1 AO, BO, 8,16 -.endm - - -.macro SAVE2x1 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxsd v5 , 0(T1) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxspltd vs1,vs0,0 - xxspltd vs3,vs0,1 - /*--v4==vs36 v5==vs37---*/ - xvaddsp vs36,vs36,vs1 - xvaddsp vs37,vs37,vs3 -#else - /*--v4==vs36 v5==vs37---*/ - xxspltd vs36,vs0,0 - xxspltd vs37,vs0,1 -#endif - stxsd v4 , 0(CO) - stxsd v5 , 0(T1) - addi CO, CO, 8 -.endm - -/* macros for N=1 and M=8 -**********************************************************************************************/ - -.macro Zero1x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 -.endm - - -.macro LOAD1x8 - LOAD1x8O 0,0 -.endm - - -.macro LOAD1x8O OffsetA,OffsetB - lxsd vs4, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x8_NORMAL - END1x8 AO,BO,64,8 -.endm - - -.macro END1x8_WITHOUT_ADD - END1x8 AO,BO,0,0 -.endm - - -.macro END1x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 -.endm - - -.macro LOAD1x8_2 - LOAD1x8_2O 0,0 -.endm - - -.macro LOAD1x8_2O OffsetA,OffsetB - lxv vs27, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - lxv vs6, (32+\OffsetA)(AO) - lxv vs7, (48+\OffsetA)(AO) - lxv vs0, (64+\OffsetA)(AO) - lxv vs1, 
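The N=1 macros in this stretch (LOAD1x8, LOAD1x4 and their KERNEL1x*_2 forms) broadcast a single complex element of B (lxsd plus xxspltd) against a strip of A elements. Below is a scalar sketch of that strip update, assuming A is packed with m complex values per k step and the non-conjugated (NN) case; the vector code keeps the r*r/i*i and r*i/i*r partial sums in separate accumulators and only fixes the signs later in AGGREGATE_REALS_IMAGES. All names here are illustrative.

/* N=1 edge case: one B element broadcast against an m-wide strip of A (sketch). */
static void cgemm_n1_strip(int m, int k,
                           const float *a,      /* packed A: k steps of m {r,i} pairs */
                           const float *b,      /* packed B: k {r,i} pairs            */
                           float *acc_r, float *acc_i)  /* m partial sums             */
{
    for (int l = 0; l < k; l++) {
        float br = b[2 * l], bi = b[2 * l + 1];      /* broadcast one complex B value */
        for (int i = 0; i < m; i++) {
            float ar = a[2 * (l * m + i)];
            float ai = a[2 * (l * m + i) + 1];
            acc_r[i] += ar * br - ai * bi;           /* NN-case sign convention */
            acc_i[i] += ar * bi + ai * br;
        }
    }
}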
(64+16+\OffsetA)(AO) - lxv vs2, (64+32+\OffsetA)(AO) - lxv vs3, (64+48+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x8_2 - /*for load2 offset will be 128 and 16*/ - KERNEL1x8_2 AO,BO, 128,16,0 ,1,1 -.endm - - -.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 -.if \Complete==0 - lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) -.endif -.if \Complete==0 - xxspltd vs8,vs27,1 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \Complete==0 - lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP16(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP16(\Index,128) -.endif - -.endif -.endm - - -.macro KERNEL1x8 - LOAD1x8 - END1x8 AO, BO, 64,8 -.endm - - -.macro SAVE1x8 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask -#ifndef TRMMKERNEL - lxv vs26 , 32(CO) - lxv vs27 , 48(CO) -#endif - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs2,vs34,permute_mask - xxperm vs6,vs42,permute_mask - xxperm vs3,vs35,permute_mask - xxperm vs7,vs43,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 - AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs34,vs42,vs4,vs5 - MULT_APLHA_PART1 vs35,vs43,vs6,vs7 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs34,vs42,vs4,vs5 - MULT_APLHA_PART2 vs35,vs43,vs6,vs7 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, vs28 - xxperm vs2,vs3, vs28 - xxperm vs4,vs5, vs28 - xxperm vs6,vs7, vs28 -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs24,vs24,vs0 - xvaddsp vs25,vs25,vs2 - xvaddsp vs26,vs26,vs4 - xvaddsp vs27,vs27,vs6 - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 32(CO) - stxv vs27 , 48(CO) -#else -/* reconstruct r,i pairs*/ - stxv vs0 , 0(CO) - stxv vs2 , 16(CO) - stxv vs4 , 32(CO) - stxv vs6 , 48(CO) -#endif - addi CO, CO, 64 -.endm - -/* macros for N=1 and M=4 
-**********************************************************************************************/ - -.macro Zero1x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 -.endm - - -.macro LOAD1x4 - LOAD1x4O 0,0 -.endm - - -.macro LOAD1x4O OffsetA,OffsetB - lxsd vs4, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x4_NORMAL - END1x4 AO,BO,32,8 -.endm - - -.macro END1x4_WITHOUT_ADD - END1x4 AO,BO,0,0 -.endm - - -.macro END1x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.endm - - -.macro LOAD1x4_2 - LOAD1x4_2O 0,0 -.endm - - -.macro LOAD1x4_2O OffsetA,OffsetB - lxv vs27, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - lxv vs0, (32+\OffsetA)(AO) - lxv vs1, (32+16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x4_2 - /*for load2 offset will be 64 and 16*/ - KERNEL1x4_2 AO,BO, 64,16,0 ,1,1 -.endm - - -.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs8,vs27,1 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP8(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL1x4 - LOAD1x4 - END1x4 AO, BO, 32,8 -.endm - - -.macro SAVE1x4 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, vs28 - xxperm vs2,vs3, vs28 -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs24,vs24,vs0 - xvaddsp vs25,vs25,vs2 - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) -#else -/* reconstruct r,i pairs*/ - stxv vs0 , 0(CO) - stxv vs2 , 16(CO) -#endif - addi CO, CO, 32 -.endm - -/* macros for N=1 and M=2 
-**********************************************************************************************/ - -.macro Zero1x2 - xxlxor vs32, vs32, vs32 - xxlxor vs40, vs40, vs40 -.endm - - -.macro LOAD1x2 - LOAD1x2O 0,0 -.endm - - -.macro LOAD1x2O OffsetA,OffsetB - lxsd vs4, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x2_NORMAL - END1x2 AO,BO,16,8 -.endm - - -.macro END1x2_WITHOUT_ADD - END1x2 AO,BO,0,0 -.endm - - -.macro END1x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.endm - - -.macro LOAD1x2_2 - LOAD1x2_2O 0,0 -.endm - - -.macro LOAD1x2_2O OffsetA,OffsetB - lxv vs27, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs0, (16+\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x2_2 - /*for load2 offset will be 32 and 16*/ - KERNEL1x2_2 AO,BO, 32,16,0 ,1,1 -.endm - - -.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs4, DISP4(\Index,0+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs8,vs27,1 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.if \Complete==0 - lxv vs0, DISP4(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP4(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP4(\Index,32) -.endif - -.endif -.endm - - -.macro KERNEL1x2 - LOAD1x2 - END1x2 AO, BO, 16,8 -.endm - - -.macro SAVE1x2 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, vs28 -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs24,vs24,vs0 - stxv vs24 , 0(CO) -#else -/* reconstruct r,i pairs*/ - stxv vs0 , 0(CO) -#endif - addi CO, CO, 16 -.endm - -/* macros for N=1 and M=1 -**********************************************************************************************/ -.macro Zero1x1 - xxlxor vs32, vs32, vs32 - xxlxor vs40, vs40, vs40 -.endm - - -.macro LOAD1x1 - LOAD1x1O 0,0 -.endm - - -.macro LOAD1x1O OffsetA,OffsetB - lxsd v4, (\OffsetB+0)(BO) - lxsd v5, (\OffsetA+0)(AO) - xxperm vs38, vs36, permute_mask -.endm - - -.macro END1x1_NORMAL - END1x1 AO,BO,8,8 -.endm - - -.macro END1x1_WITHOUT_ADD - END1x1 AO,BO,0,0 -.endm - - -.macro END1x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs37,vs36 - xvmaddasp vs40, vs37,vs38 -.endm - - -.macro LOAD1x1_2 - LOAD1x1_2O 0,0 -.endm - - -.macro 
LOAD1x1_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask -.endm - - -.macro END1x1_2 - /*for load2 offset will be 16 and 16*/ - KERNEL1x1_2 AO,BO, 16,16,0 ,1,1 -.endm - - -.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs8, DISP2(\Index,\OffsetB)(\BREG) - lxv vs4, DISP2(\Index,\OffsetB)(\AREG) - xxperm vs10, vs8, permute_mask -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP2(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP2(\Index,16) -.endif - -.endif -.endm - - -.macro KERNEL1x1 - LOAD1x1 - END1x1 AO, BO, 8,8 -.endm - - -.macro SAVE1x1 -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif - /*aggregate x2*/ - xxpermdi vs33,vs32,vs32,2 - xxpermdi vs41,vs40,vs40,2 - xvaddsp vs32,vs32,vs33 - xvaddsp vs40,vs40,vs41 - - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs37,vs1 - MULT_APLHA_PART2 vs32,vs40,vs37,vs1 - -/* reconstruct r,i pairs*/ - xxperm vs37,vs1, vs28 - -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs36,vs36,vs37 - stxsd v4 , 0(CO) -#else - -/* vs37 is v5 */ - stxsd v5 , 0(CO) -#endif - addi CO, CO, 8 -.endm - - - - -/****************************TRMM POINTER REFRESH MACROSES*************************/ - - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 7 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 4 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 3 - .endif -.endm - -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*8; -// ptrbb = bb + off*4; -// #endif -*/ -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - - #else - /* - // ptrba =ptrba+ off*C_A; - // ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+8; // number of values in A -// #else -// temp = off+4; // number of values in B -// #endif -*/ -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - /* temp = bk-off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif - -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 8; // number of values in A -// #else -// temp -= 4; // number of values in B -// #endif -// ptrba += temp*8; -// ptrbb += temp*4; -// #endif - -// #ifdef LEFT -// off += 8; // number of values in A -// #endif -*/ - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - - #endif - - #ifdef LEFT - /*off += 8; // number of values in A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif + +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
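The TRMM pointer-refresh macros above encode the bookkeeping that their embedded C-style comments spell out. Here is a C rendering of REFRESH_POINTERS and REFRESH_TEMP_BK under those comments' conventions; c_a/c_b are the tile widths in complex elements (8 and 4 for the 4x8 kernel), and the SHIFT_REG helper performs the same element-to-byte scaling with slwi because one single-precision complex value occupies unit_size = 8 bytes. Function and parameter names are hypothetical.

typedef struct { float r, i; } cfloat;   /* 8 bytes: matches unit_size */

/* REFRESH_POINTERS: position the packed-A/packed-B pointers for a TRMM tile. */
static void refresh_pointers(const cfloat **ptrba, const cfloat **ptrbb,
                             const cfloat *bb, int off, int c_a, int c_b,
                             int b_from_panel_start) /* (LEFT&&TRANSA)||(!LEFT&&!TRANSA) */
{
    if (b_from_panel_start) {
        *ptrbb = bb;                        /* ptrbb = bb                  */
    } else {
        *ptrba += (long)off * c_a;          /* ptrba += off*c_a elements   */
        *ptrbb  = bb + (long)off * c_b;     /* ptrbb  = bb + off*c_b       */
    }
}

/* REFRESH_TEMP_BK: how many k iterations this tile actually runs. */
static int refresh_temp_bk(int bk, int off, int c_a, int c_b,
                           int full_depth,  /* (LEFT&&!TRANSA)||(!LEFT&&TRANSA) */
                           int left)
{
    if (full_depth)
        return bk - off;
    return left ? off + c_a      /* number of values in A */
                : off + c_b;     /* number of values in B */
}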
+*****************************************************************************/
+
+/**************************************************************************************
+* Abdelrauf(quickwritereader@gmail.com)
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+* LAPACK-TEST : OK
+**************************************************************************************/
+#define unit_size 8
+#define DISP32(ind,disp) (ind*unit_size*32+disp)
+#define DISP16(ind,disp) (ind*unit_size*16+disp)
+#define DISP8(ind,disp) (ind*unit_size*8+disp)
+#define DISP4(ind,disp) (ind*unit_size*4+disp)
+#define DISP2(ind,disp) (ind*unit_size*2+disp)
+#define DISP1(ind,disp) (ind*unit_size+disp)
+#define DISPX(disp) (disp)
+
+.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#else // CC || CR || RC || RR
+ /*we will assume {-alpha_r,-alpha_i} for this case */
+ /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/
+ xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /*we will negate alpha image instead to fix sign*/
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
+
+
+.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#else // CC || CR || RC || RR
+ /*we will assume {-alpha_r,-alpha_i} for this case */
+ /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/
+ xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /*we will negate alpha image instead to fix sign*/
+ xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
+
+/* {i0,i1} * {alpha_i,alpha_i} [- VSOUT1] ;[VSOUT2 +] {r0,r1}*{alpha_i,alpha_i} */
+
+.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmulsp \VSOUT1,\VSINII, alpha_i
+ xvmulsp \VSOUT2,\VSINRR, alpha_i
+.endm
+
+/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
+
+.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmsubasp \VSOUT1,\VSINRR, alpha_r
+ xvmaddasp \VSOUT2,\VSINII, alpha_r
+.endm
+
+/* macros for N=4 and M=8
+**********************************************************************************************/
+
+.macro Zero4x8
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38
+ xxlxor vs39, vs39, vs39
+ xxlxor vs40, vs40, vs40
+ xxlxor vs41, vs41, vs41
+ xxlxor vs42, vs42, vs42
+ xxlxor vs43, vs43, vs43
+ xxlxor vs44, vs44, vs44
+ xxlxor vs45, vs45, vs45
+ xxlxor vs46, vs46, vs46
+ xxlxor vs47, vs47, vs47
+ xxlxor vs48, vs48, vs48
+ xxlxor vs49, vs49, vs49
+ xxlxor vs50, vs50, vs50
+ xxlxor vs51, vs51, vs51
+ xxlxor vs52, vs52, vs52
+ xxlxor vs53, vs53, vs53
+ xxlxor vs54, vs54, vs54
+ xxlxor vs55, vs55, vs55
+ xxlxor vs56, vs56,
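Two notes on the helpers just defined. The DISPn(ind,disp) macros only turn an unroll index into a byte offset: with unit_size = 8 bytes per single-precision complex value, DISP16(ind,disp) = ind*16*8 + disp, i.e. the start of the ind-th group of sixteen complex elements plus a fixed displacement. The sign choices in AGGREGATE_REALS_IMAGES follow the ordinary complex-product identities; below is a scalar sketch of the two simplest variants, assuming (as the register usage suggests) that the four products a_r*b_r, a_i*b_i, a_r*b_i and a_i*b_r are accumulated separately. Names are illustrative, not from the kernel.

/* Combine separately accumulated products into real/imag results (sketch). */
static void combine(float arbr, float aibi, float arbi, float aibr,
                    int conj_a,           /* conjugated A, as in the CN/CT cases */
                    float *out_r, float *out_i)
{
    if (!conj_a) {                 /* NN-type: plain complex product */
        *out_r = arbr - aibi;
        *out_i = arbi + aibr;
    } else {                       /* conj(a)*b flips both signs     */
        *out_r = arbr + aibi;
        *out_i = arbi - aibr;
    }
}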
vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + + +.macro LOAD4x8 + LOAD4x8O 0,0 +.endm + + +.macro LOAD4x8O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x8_NORMAL + END4x8 AO,BO,64,32 +.endm + + +.macro END4x8_WITHOUT_ADD + END4x8 AO,BO,0,0 +.endm + + +.macro END4x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.endm + + +.macro LOAD4x8_2 + LOAD4x8_2O 0,0 +.endm + + +.macro LOAD4x8_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs12, (16+\OffsetB)(BO) + lxv vs24, (32+\OffsetB)(BO) + lxv vs28, (32+16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + lxv vs6, (32+\OffsetA)(AO) + lxv vs7, (48+\OffsetA)(AO) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv vs0, (64+\OffsetA)(AO) + lxv vs1, (64+16+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + lxv vs2, (64+32+\OffsetA)(AO) + lxv vs3, (64+48+\OffsetA)(AO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x8_2 + /*for load2 offset will be 128 and 64*/ + KERNEL4x8_2 AO,BO, 128,64,0 ,1,1 +.endm + + +.macro KERNEL4x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 +.if \Complete==0 + lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) + lxv vs5, 
DISP16(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 +.if \Complete==0 + lxv vs8, DISP8(\Index,\OffsetB)(\BREG) + lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 +.if \Complete==0 + lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endif + +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index,64) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL4x8 + LOAD4x8 + END4x8 AO, BO, 64,32 +.endm + + +.macro SAVE4x8 + add T4, LDC,LDC + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask +#ifndef TRMMKERNEL + lxv vs28 , 0(T1) + lxv vs29 , 16(T1) +#endif + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask +#ifndef TRMMKERNEL + lxv vs30 , 32(T1) + lxv vs31 , 48(T1) +#endif + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + add T2,CO,T4 + add T3,T1,T4 + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + xxperm vs9,vs37,permute_mask + xxperm 
vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + xxperm vs10,vs38,permute_mask + xxperm vs14,vs46,permute_mask + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + xxperm vs11,vs39,permute_mask + xxperm vs15,vs47,permute_mask + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + xxperm vs0,vs48,permute_mask + xxperm vs4,vs56,permute_mask + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + xxperm vs1,vs49,permute_mask + xxperm vs5,vs57,permute_mask + AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 + xxperm vs2,vs50,permute_mask + xxperm vs6,vs58,permute_mask + AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 + xxperm vs3,vs51,permute_mask + xxperm vs7,vs59,permute_mask + AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 + xxperm vs8,vs52,permute_mask + xxperm vs12,vs60,permute_mask + AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 + xxperm vs9,vs53,permute_mask + xxperm vs13,vs61,permute_mask + AGGREGATE_REALS_IMAGES vs50,vs2,vs58,vs6 + xxperm vs10,vs54,permute_mask + xxperm vs14,vs62,permute_mask + AGGREGATE_REALS_IMAGES vs51,vs3,vs59,vs7 + xxperm vs11,vs55,permute_mask + xxperm vs15,vs63,permute_mask + AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 + AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + AGGREGATE_REALS_IMAGES vs54,vs10,vs62,vs14 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + AGGREGATE_REALS_IMAGES vs55,vs11,vs63,vs15 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 + #ifndef TRMMKERNEL + lxv vs32 , 0(T2) + lxv vs40 , 16(T2) +#endif + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 +#ifndef TRMMKERNEL + lxv vs33 , 32(T2) + lxv vs41 , 48(T2) +#endif + MULT_APLHA_PART1 vs38,vs46,vs12,vs13 + MULT_APLHA_PART1 vs39,vs47,vs14,vs15 +#ifndef TRMMKERNEL + lxv vs34 , 0(T3) + lxv vs42 , 16(T3) +#endif + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +#ifndef TRMMKERNEL + lxv vs35 , 32(T3) + lxv vs43 , 48(T3) +#endif + MULT_APLHA_PART2 vs38,vs46,vs12,vs13 + MULT_APLHA_PART2 vs39,vs47,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs26,vs26,vs5 + xvaddsp vs27,vs27,vs7 + xvaddsp vs28,vs28,vs9 + xvaddsp vs29,vs29,vs11 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs12,vs4,2 + xxpermdi vs27,vs14,vs6,2 + xxpermdi vs28,vs0,vs8,2 + xxpermdi vs29,vs2,vs10,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + MULT_APLHA_PART1 vs48,vs56,vs0,vs1 + MULT_APLHA_PART1 vs49,vs57,vs2,vs3 + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) + MULT_APLHA_PART1 vs50,vs58,vs4,vs5 + MULT_APLHA_PART1 vs51,vs59,vs6,vs7 + stxv vs28 , 0(T1) + stxv vs29 , 16(T1) + MULT_APLHA_PART2 vs48,vs56,vs0,vs1 + MULT_APLHA_PART2 vs49,vs57,vs2,vs3 + stxv vs30 , 32(T1) + stxv vs31 , 48(T1) + MULT_APLHA_PART2 
vs50,vs58,vs4,vs5 + MULT_APLHA_PART2 vs51,vs59,vs6,vs7 + MULT_APLHA_PART1 vs52,vs60,vs8,vs9 + MULT_APLHA_PART1 vs53,vs61,vs10,vs11 + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + MULT_APLHA_PART1 vs54,vs62,vs12,vs13 + MULT_APLHA_PART1 vs55,vs63,vs14,vs15 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + MULT_APLHA_PART2 vs52,vs60,vs8,vs9 + MULT_APLHA_PART2 vs53,vs61,vs10,vs11 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + MULT_APLHA_PART2 vs54,vs62,vs12,vs13 + MULT_APLHA_PART2 vs55,vs63,vs14,vs15 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs32,vs32,vs1 + xvaddsp vs40,vs40,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs33,vs33,vs5 + xvaddsp vs41,vs41,vs7 + xvaddsp vs34,vs34,vs9 + xvaddsp vs42,vs42,vs11 + xvaddsp vs35,vs35,vs13 + xvaddsp vs43,vs43,vs15 +#else + xxpermdi vs32,vs8,vs0,2 + xxpermdi vs40,vs10,vs2,2 + xxpermdi vs33,vs12,vs4,2 + xxpermdi vs41,vs14,vs6,2 + xxpermdi vs34,vs0,vs8,2 + xxpermdi vs42,vs2,vs10,2 + xxpermdi vs35,vs4,vs12,2 + xxpermdi vs43,vs6,vs14,2 +#endif + stxv vs32 , 0(T2) + stxv vs40 , 16(T2) + stxv vs33 , 32(T2) + stxv vs41 , 48(T2) + stxv vs34 , 0(T3) + stxv vs42 , 16(T3) + stxv vs35 , 32(T3) + stxv vs43 , 48(T3) + addi CO, CO, 64 +.endm + +/* macros for N=4 and M=4 +**********************************************************************************************/ + +.macro Zero4x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 +.endm + + +.macro LOAD4x4 + LOAD4x4O 0,0 +.endm + + +.macro LOAD4x4O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x4_NORMAL + END4x4 AO,BO,32,32 +.endm + + +.macro END4x4_WITHOUT_ADD + END4x4 AO,BO,0,0 +.endm + + +.macro END4x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.endm + + +.macro LOAD4x4_2 + LOAD4x4_2O 0,0 +.endm + + +.macro LOAD4x4_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs12, (16+\OffsetB)(BO) + lxv vs24, (32+\OffsetB)(BO) + lxv vs28, (32+16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv 
vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + + +.macro END4x4_2 + /*for load2 offset will be 64 and 64*/ + KERNEL4x4_2 AO,BO, 64,64,0 ,1,1 +.endm + + +.macro KERNEL4x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 +.if \Complete==0 + lxv vs8, DISP8(\Index,\OffsetB)(\BREG) + lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 +.if \Complete==0 + lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index,64) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x4 + LOAD4x4 + END4x4 AO, BO, 32,32 +.endm + + +.macro SAVE4x4 + add T4, LDC,LDC + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) + lxv vs27 , 16(T1) +#endif + #ifndef TRMMKERNEL + lxv vs28 , 0(T2) + lxv vs29 , 16(T2) +#endif +#ifndef TRMMKERNEL + lxv vs30 , 0(T3) + lxv vs31 , 16(T3) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES 
vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + xxperm vs0,vs48,permute_mask + xxperm vs4,vs56,permute_mask + xxperm vs1,vs49,permute_mask + xxperm vs5,vs57,permute_mask + xxperm vs8,vs52,permute_mask + xxperm vs12,vs60,permute_mask + xxperm vs9,vs53,permute_mask + xxperm vs13,vs61,permute_mask + AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 + AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 + AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 + AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART1 vs48,vs56,vs4,vs5 + MULT_APLHA_PART1 vs49,vs57,vs6,vs7 + MULT_APLHA_PART1 vs52,vs60,vs12,vs13 + MULT_APLHA_PART1 vs53,vs61,vs14,vs15 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs48,vs56,vs4,vs5 + MULT_APLHA_PART2 vs49,vs57,vs6,vs7 + MULT_APLHA_PART2 vs52,vs60,vs12,vs13 + MULT_APLHA_PART2 vs53,vs61,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xvaddsp vs26,vs26,vs9 + xvaddsp vs27,vs27,vs11 + xvaddsp vs28,vs28,vs5 + xvaddsp vs29,vs29,vs7 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs0,vs8,2 + xxpermdi vs27,vs2,vs10,2 + xxpermdi vs28,vs12,vs4,2 + xxpermdi vs29,vs14,vs6,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 0(T1) + stxv vs27 , 16(T1) + stxv vs28 , 0(T2) + stxv vs29 , 16(T2) + stxv vs30 , 0(T3) + stxv vs31 , 16(T3) + addi CO, CO, 32 +.endm + +/* macros for N=4 and M=2 +**********************************************************************************************/ + +.macro Zero4x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 +.endm + + +.macro LOAD4x2 + LOAD4x2O 0,0 +.endm + + +.macro LOAD4x2O OffsetA,OffsetB + lxv vs24, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + lxv vs1, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END4x2_NORMAL + END4x2 AO,BO,16,32 +.endm + + +.macro END4x2_WITHOUT_ADD + END4x2 AO,BO,0,0 +.endm + + +.macro END4x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.endm + + +.macro LOAD4x2_2 + LOAD4x2_2O 0,0 +.endm + + +.macro LOAD4x2_2O OffsetA,OffsetB + lxv vs8, 
(\OffsetA)(AO) + lxv vs24, (16+\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv vs5, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + lxv vs0, (32+\OffsetB)(BO) + lxv vs1, (32+16+\OffsetB)(BO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END4x2_2 + /*for load2 offset will be 32 and 64*/ + KERNEL4x2_2 AO,BO, 32,64,0 ,1,1 +.endm + + +.macro KERNEL4x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) + lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP8(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,32) + addi \BREG, \BREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x2 + LOAD4x2 + END4x2 AO, BO, 16,32 +.endm + + +.macro SAVE4x2 + add T4, LDC,LDC + add T1, CO ,LDC + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs25 , 0(T1) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T2) +#endif +#ifndef TRMMKERNEL + lxv vs27 , 0(T3) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs37,vs9,vs45,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,0 + xxpermdi vs9,vs10,vs2,0 + xxpermdi vs3,vs0,vs8,3 + xxpermdi vs11,vs2,vs10,3 + xvaddsp vs24,vs24,vs1 + 
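+/* non-TRMM path: the remaining xvaddsp ops accumulate the alpha-scaled results into the C values loaded above from T1, T2 and T3 (vs25, vs26, vs27) */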
xvaddsp vs26,vs26,vs9 + xvaddsp vs25,vs25,vs3 + xvaddsp vs27,vs27,vs11 +#else + xxpermdi vs24,vs8,vs0,0 + xxpermdi vs26,vs10,vs2,0 + xxpermdi vs25,vs0,vs8,3 + xxpermdi vs27,vs2,vs10,3 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 0(T1) + stxv vs26 , 0(T2) + stxv vs27 , 0(T3) + addi CO, CO, 16 +.endm + +/* macros for N=4 and M=2 +**********************************************************************************************/ + +.macro Zero4x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 +.endm + + +.macro LOAD4x1 + LOAD4x1O 0,0 +.endm + + +.macro LOAD4x1O OffsetA,OffsetB + lxsd v4, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + lxv vs1, (\OffsetB+16)(BO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END4x1_NORMAL + END4x1 AO,BO,8,32 +.endm + + +.macro END4x1_WITHOUT_ADD + END4x1 AO,BO,0,0 +.endm + + +.macro END4x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.endm + + +.macro LOAD4x1_2 + LOAD4x1_2O 0,0 +.endm + + +.macro LOAD4x1_2O OffsetA,OffsetB + lxv vs27, (\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs4, (0+\OffsetB)(BO) + lxv vs5, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + lxv vs0, (32+\OffsetB)(BO) + lxv vs1, (32+16+\OffsetB)(BO) +.endm + + +.macro END4x1_2 + /*for load2 offset will be 16 and 64*/ + KERNEL4x1_2 AO,BO, 16,64,0 ,1,1 +.endm + + +.macro KERNEL4x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL4x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL4x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetA)(\AREG) + xxspltd vs8,vs27,1 +.endif +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) + lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) + lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP8(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,16) + addi \BREG, \BREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL4x1 + LOAD4x1 + END4x1 AO, BO, 8,32 +.endm + + +.macro SAVE4x1 + add T4, LDC,LDC + add T1, CO ,LDC + add T2,CO,T4 + add T3,T1,T4 +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) +#endif +#ifndef TRMMKERNEL + lxsd v6 , 0(T2) +#endif +#ifndef TRMMKERNEL + lxsd v7 , 0(T3) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + 
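+/* MULT_APLHA_PART1 and MULT_APLHA_PART2 together perform the full complex scaling by alpha: out_r = r*alpha_r - i*alpha_i, out_i = i*alpha_r + r*alpha_i (see the macro definitions at the top of this file) */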
MULT_APLHA_PART2 vs33,vs41,vs2,vs3 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1,vs0,0 + xxspltd vs3,vs0,1 + xxspltd vs9,vs2,0 + xxspltd vs11,vs2,1 + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xvaddsp vs36,vs36,vs1 + xvaddsp vs37,vs37,vs3 + xvaddsp vs38,vs38,vs9 + xvaddsp vs39,vs39,vs11 +#else + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xxspltd vs36,vs0,0 + xxspltd vs37,vs0,1 + xxspltd vs38,vs2,0 + xxspltd vs39,vs2,1 +#endif + stxsd v4 , 0(CO) + stxsd v5 , 0(T1) + stxsd v6 , 0(T2) + stxsd v7 , 0(T3) + addi CO, CO, 8 +.endm + +/* macros for N=2 and M=8 +**********************************************************************************************/ + +.macro Zero2x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + + +.macro LOAD2x8 + LOAD2x8O 0,0 +.endm + + +.macro LOAD2x8O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + xxperm vs26, vs24, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x8_NORMAL + END2x8 AO,BO,64,16 +.endm + + +.macro END2x8_WITHOUT_ADD + END2x8 AO,BO,0,0 +.endm + + +.macro END2x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 +.endm + + +.macro LOAD2x8_2 + LOAD2x8_2O 0,0 +.endm + + +.macro LOAD2x8_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs24, (16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + lxv vs6, (32+\OffsetA)(AO) + lxv vs7, (48+\OffsetA)(AO) + lxv vs0, (64+\OffsetA)(AO) + lxv vs1, (64+16+\OffsetA)(AO) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs25, vs24, vs24,2 + lxv vs2, (64+32+\OffsetA)(AO) + lxv vs3, (64+48+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x8_2 + /*for load2 offset will be 128 and 32*/ + KERNEL2x8_2 AO,BO, 128,32,0 ,1,1 +.endm + + +.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) 
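+/* the loads above refresh vs4/vs5 with the A data for the next pass of the unrolled K loop so they overlap the remaining multiply-adds; they are skipped on the loop tail, i.e. when the Complete argument is 1 */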
+.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index,32) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL2x8 + LOAD2x8 + END2x8 AO, BO, 64,16 +.endm + + +.macro SAVE2x8 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask +#ifndef TRMMKERNEL + lxv vs28 , 0(T1) + lxv vs29 , 16(T1) +#endif + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask +#ifndef TRMMKERNEL + lxv vs30 , 32(T1) + lxv vs31 , 48(T1) +#endif + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + add T2,CO,T4 + add T3,T1,T4 + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + xxperm vs10,vs38,permute_mask + xxperm vs14,vs46,permute_mask + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + xxperm vs11,vs39,permute_mask + xxperm vs15,vs47,permute_mask + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 + AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART1 vs38,vs46,vs12,vs13 + MULT_APLHA_PART1 vs39,vs47,vs14,vs15 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs38,vs46,vs12,vs13 + MULT_APLHA_PART2 
vs39,vs47,vs14,vs15 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs4,vs5, save_permute_1 + xxperm vs6,vs7, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 + xxperm vs12,vs13, save_permute_1 + xxperm vs14,vs15, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs5,vs12,vs4,2 + xxpermdi vs7,vs14,vs6,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xxpermdi vs13,vs4,vs12,2 + xxpermdi vs15,vs6,vs14,2 + xvaddsp vs26,vs26,vs5 + xvaddsp vs27,vs27,vs7 + xvaddsp vs28,vs28,vs9 + xvaddsp vs29,vs29,vs11 + xvaddsp vs30,vs30,vs13 + xvaddsp vs31,vs31,vs15 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs12,vs4,2 + xxpermdi vs27,vs14,vs6,2 + xxpermdi vs28,vs0,vs8,2 + xxpermdi vs29,vs2,vs10,2 + xxpermdi vs30,vs4,vs12,2 + xxpermdi vs31,vs6,vs14,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) + stxv vs28 , 0(T1) + stxv vs29 , 16(T1) + stxv vs30 , 32(T1) + stxv vs31 , 48(T1) + addi CO, CO, 64 +.endm + +/* macros for N=2 and M=4 +**********************************************************************************************/ + +.macro Zero2x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 +.endm + + +.macro LOAD2x4 + LOAD2x4O 0,0 +.endm + + +.macro LOAD2x4O OffsetA,OffsetB + lxv vs24, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x4_NORMAL + END2x4 AO,BO,32,16 +.endm + + +.macro END2x4_WITHOUT_ADD + END2x4 AO,BO,0,0 +.endm + + +.macro END2x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.endm + + +.macro LOAD2x4_2 + LOAD2x4_2O 0,0 +.endm + + +.macro LOAD2x4_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs24, (16+\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs25, vs24, vs24,2 + lxv vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x4_2 + /*for load2 offset will be 64 and 32*/ + KERNEL2x4_2 AO,BO, 64,32,0 ,1,1 +.endm + + +.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + 
lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index,32) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL2x4 + LOAD2x4 + END2x4 AO, BO, 32,16 +.endm + + +.macro SAVE2x4 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) + lxv vs27 , 16(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + xxperm vs9,vs37,permute_mask + xxperm vs13,vs45,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 + AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART1 vs37,vs45,vs10,vs11 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs37,vs45,vs10,vs11 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs2,vs3, save_permute_1 + xxperm vs8,vs9, save_permute_1 + xxperm vs10,vs11, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,2 + xxpermdi vs3,vs10,vs2,2 + xxpermdi vs9,vs0,vs8,2 + xxpermdi vs11,vs2,vs10,2 + xvaddsp vs24,vs24,vs1 + xvaddsp vs25,vs25,vs3 + xvaddsp vs26,vs26,vs9 + xvaddsp vs27,vs27,vs11 +#else + xxpermdi vs24,vs8,vs0,2 + xxpermdi vs25,vs10,vs2,2 + xxpermdi vs26,vs0,vs8,2 + xxpermdi vs27,vs2,vs10,2 +#endif + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 0(T1) + stxv vs27 , 16(T1) + addi CO, CO, 32 +.endm + +/* macros for N=2 and M=2 +**********************************************************************************************/ + +.macro Zero2x2 + xxlxor vs32, vs32, vs32 + xxlxor vs36, vs36, vs36 + xxlxor vs40, vs40, vs40 + xxlxor vs44, vs44, vs44 +.endm + + +.macro LOAD2x2 + LOAD2x2O 0,0 +.endm + + +.macro LOAD2x2O OffsetA,OffsetB + lxv vs24, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x2_NORMAL + END2x2 AO,BO,16,16 +.endm + + +.macro END2x2_WITHOUT_ADD + END2x2 AO,BO,0,0 +.endm + + +.macro END2x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs44, vs0,vs27 +.endm + + +.macro LOAD2x2_2 + LOAD2x2_2O 0,0 +.endm + + +.macro LOAD2x2_2O OffsetA,OffsetB + lxv vs8, (\OffsetA)(AO) + lxv 
vs24, (16+\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv vs0, (16+\OffsetB)(BO) + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs27, vs26, vs26,2 +.endm + + +.macro END2x2_2 + /*for load2 offset will be 32 and 32*/ + KERNEL2x2_2 AO,BO, 32,32,0 ,1,1 +.endm + + +.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs8, DISP4(\Index,\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs44, vs4,vs11 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.endif +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs44, vs0,vs27 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,32) + addi \BREG, \BREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL2x2 + LOAD2x2 + END2x2 AO, BO, 16,16 +.endm + + +.macro SAVE2x2 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs26 , 0(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs8,vs36,permute_mask + xxperm vs12,vs44,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs36,vs44,vs8,vs9 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs36,vs44,vs8,vs9 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 + xxperm vs8,vs9, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1,vs8,vs0,0 + xxpermdi vs9,vs0,vs8,3 + xvaddsp vs24,vs24,vs1 + xvaddsp vs26,vs26,vs9 +#else + xxpermdi vs24,vs8,vs0,0 + xxpermdi vs26,vs0,vs8,3 +#endif + stxv vs24 , 0(CO) + stxv vs26 , 0(T1) + addi CO, CO, 16 +.endm + +/* macros for N=2 and M=1 +**********************************************************************************************/ + +.macro Zero2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD2x1 + LOAD2x1O 0,0 +.endm + + +.macro LOAD2x1O OffsetA,OffsetB + lxsd v4, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END2x1_NORMAL + END2x1 AO,BO,8,16 +.endm + + +.macro END2x1_WITHOUT_ADD + END2x1 AO,BO,0,0 +.endm + + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.endm + + +.macro LOAD2x1_2 + LOAD2x1_2O 0,0 +.endm + + +.macro LOAD2x1_2O OffsetA,OffsetB + lxv vs27, (\OffsetA)(AO) + lxv vs4, (0+\OffsetB)(BO) + lxv 
vs0, (16+\OffsetB)(BO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END2x1_2 + /*for load2 offset will be 16 and 32*/ + KERNEL2x1_2 AO,BO, 16,32,0 ,1,1 +.endm + + +.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetA)(\AREG) + xxspltd vs8,vs27,1 +.endif +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) +.endif + +.if \Complete==0 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,16) + addi \BREG, \BREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 8,16 +.endm + + +.macro SAVE2x1 + add T1, CO ,LDC +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1,vs0,0 + xxspltd vs3,vs0,1 + /*--v4==vs36 v5==vs37---*/ + xvaddsp vs36,vs36,vs1 + xvaddsp vs37,vs37,vs3 +#else + /*--v4==vs36 v5==vs37---*/ + xxspltd vs36,vs0,0 + xxspltd vs37,vs0,1 +#endif + stxsd v4 , 0(CO) + stxsd v5 , 0(T1) + addi CO, CO, 8 +.endm + +/* macros for N=1 and M=8 +**********************************************************************************************/ + +.macro Zero1x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 +.endm + + +.macro LOAD1x8 + LOAD1x8O 0,0 +.endm + + +.macro LOAD1x8O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x8_NORMAL + END1x8 AO,BO,64,8 +.endm + + +.macro END1x8_WITHOUT_ADD + END1x8 AO,BO,0,0 +.endm + + +.macro END1x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 +.endm + + +.macro LOAD1x8_2 + LOAD1x8_2O 0,0 +.endm + + +.macro LOAD1x8_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs6, (32+\OffsetA)(AO) + lxv vs7, (48+\OffsetA)(AO) + lxv vs0, (64+\OffsetA)(AO) + lxv vs1, 
(64+16+\OffsetA)(AO) + lxv vs2, (64+32+\OffsetA)(AO) + lxv vs3, (64+48+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x8_2 + /*for load2 offset will be 128 and 16*/ + KERNEL1x8_2 AO,BO, 128,16,0 ,1,1 +.endm + + +.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 +.if \Complete==0 + lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) +.endif +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP16(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP16(\Index,128) +.endif + +.endif +.endm + + +.macro KERNEL1x8 + LOAD1x8 + END1x8 AO, BO, 64,8 +.endm + + +.macro SAVE1x8 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask +#ifndef TRMMKERNEL + lxv vs26 , 32(CO) + lxv vs27 , 48(CO) +#endif + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + xxperm vs2,vs34,permute_mask + xxperm vs6,vs42,permute_mask + xxperm vs3,vs35,permute_mask + xxperm vs7,vs43,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 + AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART1 vs34,vs42,vs4,vs5 + MULT_APLHA_PART1 vs35,vs43,vs6,vs7 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs34,vs42,vs4,vs5 + MULT_APLHA_PART2 vs35,vs43,vs6,vs7 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 + xxperm vs2,vs3, vs28 + xxperm vs4,vs5, vs28 + xxperm vs6,vs7, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + xvaddsp vs25,vs25,vs2 + xvaddsp vs26,vs26,vs4 + xvaddsp vs27,vs27,vs6 + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) + stxv vs26 , 32(CO) + stxv vs27 , 48(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) + stxv vs2 , 16(CO) + stxv vs4 , 32(CO) + stxv vs6 , 48(CO) +#endif + addi CO, CO, 64 +.endm + +/* macros for N=1 and M=4 
+**********************************************************************************************/ + +.macro Zero1x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 +.endm + + +.macro LOAD1x4 + LOAD1x4O 0,0 +.endm + + +.macro LOAD1x4O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x4_NORMAL + END1x4 AO,BO,32,8 +.endm + + +.macro END1x4_WITHOUT_ADD + END1x4 AO,BO,0,0 +.endm + + +.macro END1x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.endm + + +.macro LOAD1x4_2 + LOAD1x4_2O 0,0 +.endm + + +.macro LOAD1x4_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs5, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + lxv vs0, (32+\OffsetA)(AO) + lxv vs1, (32+16+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x4_2 + /*for load2 offset will be 64 and 16*/ + KERNEL1x4_2 AO,BO, 64,16,0 ,1,1 +.endm + + +.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 +.if \Complete==0 + lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 +.if \Complete==0 + lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP8(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP8(\Index,64) +.endif + +.endif +.endm + + +.macro KERNEL1x4 + LOAD1x4 + END1x4 AO, BO, 32,8 +.endm + + +.macro SAVE1x4 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) + lxv vs25 , 16(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + xxperm vs1,vs33,permute_mask + xxperm vs5,vs41,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART1 vs33,vs41,vs2,vs3 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs33,vs41,vs2,vs3 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 + xxperm vs2,vs3, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + xvaddsp vs25,vs25,vs2 + stxv vs24 , 0(CO) + stxv vs25 , 16(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) + stxv vs2 , 16(CO) +#endif + addi CO, CO, 32 +.endm + +/* macros for N=1 and M=2 
+**********************************************************************************************/ + +.macro Zero1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD1x2 + LOAD1x2O 0,0 +.endm + + +.macro LOAD1x2O OffsetA,OffsetB + lxsd vs4, (\OffsetB+0)(BO) + lxv vs0, (\OffsetA+0)(AO) + xxspltd vs24,vs36,0 + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x2_NORMAL + END1x2 AO,BO,16,8 +.endm + + +.macro END1x2_WITHOUT_ADD + END1x2 AO,BO,0,0 +.endm + + +.macro END1x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.endm + + +.macro LOAD1x2_2 + LOAD1x2_2O 0,0 +.endm + + +.macro LOAD1x2_2O OffsetA,OffsetB + lxv vs27, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + lxv vs0, (16+\OffsetA)(AO) + xxspltd vs8,vs27,1 + xxspltd vs24,vs27,0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + + +.macro END1x2_2 + /*for load2 offset will be 32 and 16*/ + KERNEL1x2_2 AO,BO, 32,16,0 ,1,1 +.endm + + +.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete +.if \Complete==0 + lxv vs27, DISP2(\Index,\OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs4, DISP4(\Index,0+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs8,vs27,1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs40, vs0,vs26 +.if \Complete==0 + lxv vs0, DISP4(\Index,16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs24,vs27,0 + xxperm vs26, vs24, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP4(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP4(\Index,32) +.endif + +.endif +.endm + + +.macro KERNEL1x2 + LOAD1x2 + END1x2 AO, BO, 16,8 +.endm + + +.macro SAVE1x2 +#ifndef TRMMKERNEL + lxv vs24 , 0(CO) +#endif + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs0,vs1 + MULT_APLHA_PART2 vs32,vs40,vs0,vs1 +/* reconstruct r,i pairs*/ + xxperm vs0,vs1, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24,vs24,vs0 + stxv vs24 , 0(CO) +#else +/* reconstruct r,i pairs*/ + stxv vs0 , 0(CO) +#endif + addi CO, CO, 16 +.endm + +/* macros for N=1 and M=1 +**********************************************************************************************/ +.macro Zero1x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + + +.macro LOAD1x1 + LOAD1x1O 0,0 +.endm + + +.macro LOAD1x1O OffsetA,OffsetB + lxsd v4, (\OffsetB+0)(BO) + lxsd v5, (\OffsetA+0)(AO) + xxperm vs38, vs36, permute_mask +.endm + + +.macro END1x1_NORMAL + END1x1 AO,BO,8,8 +.endm + + +.macro END1x1_WITHOUT_ADD + END1x1 AO,BO,0,0 +.endm + + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + + xvmaddasp vs32, vs37,vs36 + xvmaddasp vs40, vs37,vs38 +.endm + + +.macro LOAD1x1_2 + LOAD1x1_2O 0,0 +.endm + + +.macro 
LOAD1x1_2O OffsetA,OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask +.endm + + +.macro END1x1_2 + /*for load2 offset will be 16 and 16*/ + KERNEL1x1_2 AO,BO, 16,16,0 ,1,1 +.endm + + +.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs40, vs4,vs10 +.if \Complete==0 + lxv vs8, DISP2(\Index,\OffsetB)(\BREG) + lxv vs4, DISP2(\Index,\OffsetB)(\AREG) + xxperm vs10, vs8, permute_mask +.endif + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index,\OffsetB) + addi \AREG, \AREG, DISP2(\Index,\OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index,16) + addi \AREG, \AREG, DISP2(\Index,16) +.endif + +.endif +.endm + + +.macro KERNEL1x1 + LOAD1x1 + END1x1 AO, BO, 8,8 +.endm + + +.macro SAVE1x1 +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif + /*aggregate x2*/ + xxpermdi vs33,vs32,vs32,2 + xxpermdi vs41,vs40,vs40,2 + xvaddsp vs32,vs32,vs33 + xvaddsp vs40,vs40,vs41 + + xxperm vs0,vs32,permute_mask + xxperm vs4,vs40,permute_mask + AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1,2 + /*VSINRR,VSINII,VSOUT1,VSOUT2*/ + MULT_APLHA_PART1 vs32,vs40,vs37,vs1 + MULT_APLHA_PART2 vs32,vs40,vs37,vs1 + +/* reconstruct r,i pairs*/ + xxperm vs37,vs1, vs28 + +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs36,vs36,vs37 + stxsd v4 , 0(CO) +#else + +/* vs37 is v5 */ + stxsd v5 , 0(CO) +#endif + addi CO, CO, 8 +.endm + + + + +/****************************TRMM POINTER REFRESH MACROSES*************************/ + + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 4 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 3 + .endif +.endm + +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*8; +// ptrbb = bb + off*4; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+8; // number of values in A +// #else +// temp = off+4; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif + +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 8; // number of values in A +// #else +// temp -= 4; // number of values in B +// #endif +// ptrba += temp*8; +// ptrbb += temp*4; +// #endif + +// #ifdef LEFT +// off += 8; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif .endm \ No newline at end of file diff --git a/kernel/power/cgemv_n.c b/kernel/power/cgemv_n.c index 8663039c5..575847da2 100644 --- a/kernel/power/cgemv_n.c +++ b/kernel/power/cgemv_n.c @@ -1,597 +1,597 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
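
Before the cgemv_n.c changes continue below, a note on the TRMM pointer-refresh macros that close the assembly file above: they implement exactly the bookkeeping spelled out in their commented pseudo-code. The plain-C restatement below uses hypothetical names (`trmm_state`, `refresh_pointers` and friends are not OpenBLAS symbols); the arithmetic follows the comments verbatim, while the assembly performs the equivalent multiplies in bytes via SHIFT_REG.

```c
/* Scalar restatement of the TRMM pointer bookkeeping described in the
 * commented pseudo-code around REFRESH_POINTERS / REFRESH_TEMP_BK /
 * REFRESH_AFTER_SAVE.  c_a and c_b stand for the per-tile value counts
 * passed as C_A and C_B (8 and 4 in the comments above). */
typedef long blaslong_t;            /* stand-in for BLASLONG             */

struct trmm_state {
    float *ptrba, *ptrbb;           /* current A and B panel pointers    */
    blaslong_t off, bk;             /* running offset and panel depth    */
};

/* Entering a tile: position the panel pointers (REFRESH_POINTERS).
 * cond is the test (LEFT && TRANSA) || (!LEFT && !TRANSA). */
void refresh_pointers(struct trmm_state *s, float *bb,
                      blaslong_t c_a, blaslong_t c_b, int cond)
{
    if (cond) {
        s->ptrbb = bb;                    /* ptrbb = bb                  */
    } else {
        s->ptrba += s->off * c_a;         /* ptrba += off*C_A            */
        s->ptrbb = bb + s->off * c_b;     /* ptrbb  = bb + off*C_B       */
    }
}

/* Inner trip count (REFRESH_TEMP_BK).
 * cond is (LEFT && !TRANSA) || (!LEFT && TRANSA). */
blaslong_t refresh_temp_bk(const struct trmm_state *s,
                           blaslong_t c_a, blaslong_t c_b,
                           int cond, int left)
{
    if (cond)
        return s->bk - s->off;            /* temp = bk - off             */
    return left ? s->off + c_a            /* temp = off + values in A    */
                : s->off + c_b;           /* temp = off + values in B    */
}

/* After storing a tile: skip the untouched tail of both panels and bump
 * the offset (REFRESH_AFTER_SAVE).
 * cond is (LEFT && TRANSA) || (!LEFT && !TRANSA). */
void refresh_after_save(struct trmm_state *s,
                        blaslong_t c_a, blaslong_t c_b,
                        int cond, int left)
{
    if (cond) {
        blaslong_t temp = s->bk - s->off - (left ? c_a : c_b);
        s->ptrba += temp * c_a;
        s->ptrbb += temp * c_b;
    }
    if (left)
        s->off += c_a;                    /* off += values in A          */
}
```
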
-*****************************************************************************/ -#if !defined(__VEC__) || !defined(__ALTIVEC__) -#include "../arm/zgemv_n.c" -#else - -#include -#include -#include "common.h" -#include -#define NBMAX 1024 - - -static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; - - -static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; - register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; - register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; - register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; - register __vector float vx2_r = {x[4], x[4],x[4], x[4]}; - register __vector float vx2_i = {-x[5], x[5],-x[5], x[5]}; - register __vector float vx3_r = {x[6], x[6],x[6], x[6]}; - register __vector float vx3_i = {-x[7], x[7],-x[7], x[7]}; -#else - register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; - register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; - register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; - register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; - register __vector float vx2_r = {x[4], -x[4],x[4], -x[4]}; - register __vector float vx2_i = {x[5], x[5],x[5], x[5]}; - register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]}; - register __vector float vx3_i = {x[7], x[7],x[7], x[7]}; -#endif - register __vector float *vptr_y = (__vector float *) y; - register __vector float *vptr_a0 = (__vector float *) a0; - register __vector float *vptr_a1 = (__vector float *) a1; - register __vector float *vptr_a2 = (__vector float *) a2; - register __vector float *vptr_a3 = (__vector float *) a3; - BLASLONG i = 0; - BLASLONG i2=16; - for (;i< n * 8; i+=32,i2+=32) { - register __vector float vy_0 = vec_vsx_ld(i,vptr_y); - register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va1 = vec_vsx_ld(i, vptr_a1); - register __vector float va2 = vec_vsx_ld(i ,vptr_a2); - register __vector float va3 = vec_vsx_ld(i ,vptr_a3); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); - register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); - - vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r; - vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r; - va0 = vec_perm(va0, va0,swap_mask); - va0_1 = vec_perm(va0_1, va0_1,swap_mask); - va1 = vec_perm(va1, va1,swap_mask); - va1_1 = vec_perm(va1_1, va1_1,swap_mask); - va2 = vec_perm(va2, va2,swap_mask); - va2_1 = vec_perm(va2_1, va2_1,swap_mask); - va3 = vec_perm(va3, va3,swap_mask); - va3_1 = vec_perm(va3_1, va3_1,swap_mask); - vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i; - vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i; - - vec_vsx_st(vy_0 ,i, vptr_y); - vec_vsx_st(vy_1,i2,vptr_y); - } - -} - - - -static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && 
defined(XCONJ) ) - register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; - register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; - register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; - register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; -#else - register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; - register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; - register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; - register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; -#endif - register __vector float *vptr_y = (__vector float *) y; - register __vector float *vptr_a0 = (__vector float *) a0; - register __vector float *vptr_a1 = (__vector float *) a1; - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vy_0 = vec_vsx_ld(i,vptr_y); - register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va1 = vec_vsx_ld(i, vptr_a1); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - - register __vector float va0x = vec_perm(va0, va0,swap_mask); - register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); - register __vector float va1x = vec_perm(va1, va1,swap_mask); - register __vector float va1x_1 = vec_perm(va1_1, va1_1,swap_mask); - vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i; - vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i; - - vec_vsx_st(vy_0 ,i, vptr_y); - vec_vsx_st(vy_1,i2,vptr_y); - } - -} - - - -static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { - - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; - register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; -#else - register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; - register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; -#endif - register __vector float *vptr_y = (__vector float *) y; - register __vector float *vptr_a0 = (__vector float *) ap; - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vy_0 = vec_vsx_ld(i,vptr_y); - register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - - register __vector float va0x = vec_perm(va0, va0,swap_mask); - register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); - vy_0 += va0*vx0_r + va0x*vx0_i; - vy_1 += va0_1*vx0_r + va0x_1*vx0_i; - - vec_vsx_st(vy_0 ,i, vptr_y); - vec_vsx_st(vy_1,i2,vptr_y); - } -} - - - - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i=0; - - - if (inc_dest != 2) { - FLOAT temp_r; - FLOAT temp_i; - for ( i=0; i +#include +#include "common.h" +#include +#define NBMAX 1024 + + +static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + + +static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; + 
register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; + register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; + register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; + register __vector float vx2_r = {x[4], x[4],x[4], x[4]}; + register __vector float vx2_i = {-x[5], x[5],-x[5], x[5]}; + register __vector float vx3_r = {x[6], x[6],x[6], x[6]}; + register __vector float vx3_i = {-x[7], x[7],-x[7], x[7]}; +#else + register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; + register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; + register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; + register __vector float vx2_r = {x[4], -x[4],x[4], -x[4]}; + register __vector float vx2_i = {x[5], x[5],x[5], x[5]}; + register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]}; + register __vector float vx3_i = {x[7], x[7],x[7], x[7]}; +#endif + register __vector float *vptr_y = (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) a0; + register __vector float *vptr_a1 = (__vector float *) a1; + register __vector float *vptr_a2 = (__vector float *) a2; + register __vector float *vptr_a3 = (__vector float *) a3; + BLASLONG i = 0; + BLASLONG i2=16; + for (;i< n * 8; i+=32,i2+=32) { + register __vector float vy_0 = vec_vsx_ld(i,vptr_y); + register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va2 = vec_vsx_ld(i ,vptr_a2); + register __vector float va3 = vec_vsx_ld(i ,vptr_a3); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); + register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); + + vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r; + vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r; + va0 = vec_perm(va0, va0,swap_mask); + va0_1 = vec_perm(va0_1, va0_1,swap_mask); + va1 = vec_perm(va1, va1,swap_mask); + va1_1 = vec_perm(va1_1, va1_1,swap_mask); + va2 = vec_perm(va2, va2,swap_mask); + va2_1 = vec_perm(va2_1, va2_1,swap_mask); + va3 = vec_perm(va3, va3,swap_mask); + va3_1 = vec_perm(va3_1, va3_1,swap_mask); + vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i; + vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i; + + vec_vsx_st(vy_0 ,i, vptr_y); + vec_vsx_st(vy_1,i2,vptr_y); + } + +} + + + +static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; + register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; + register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; + register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; +#else + register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; + register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; + register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; +#endif + register __vector float *vptr_y = (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) a0; + register __vector float *vptr_a1 = (__vector float *) a1; + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + 
register __vector float vy_0 = vec_vsx_ld(i,vptr_y); + register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + + register __vector float va0x = vec_perm(va0, va0,swap_mask); + register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); + register __vector float va1x = vec_perm(va1, va1,swap_mask); + register __vector float va1x_1 = vec_perm(va1_1, va1_1,swap_mask); + vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i; + vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i; + + vec_vsx_st(vy_0 ,i, vptr_y); + vec_vsx_st(vy_1,i2,vptr_y); + } + +} + + + +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; + register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; +#else + register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; + register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; +#endif + register __vector float *vptr_y = (__vector float *) y; + register __vector float *vptr_a0 = (__vector float *) ap; + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vy_0 = vec_vsx_ld(i,vptr_y); + register __vector float vy_1 = vec_vsx_ld(i2,vptr_y); + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + + register __vector float va0x = vec_perm(va0, va0,swap_mask); + register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); + vy_0 += va0*vx0_r + va0x*vx0_i; + vy_1 += va0_1*vx0_r + va0x_1*vx0_i; + + vec_vsx_st(vy_0 ,i, vptr_y); + vec_vsx_st(vy_1,i2,vptr_y); + } +} + + + + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i=0; + + + if (inc_dest != 2) { + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i -static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; - -static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); - //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) - register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp2_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0}; - __vector float* vptr_a0 = (__vector float*) a0; - __vector float* vptr_a1 = (__vector float*) a1; - __vector float* vptr_a2 = (__vector float*) a2; - __vector float* vptr_a3 = (__vector float*) a3; - __vector float* v_x = (__vector float*) x; - - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vx_0 = vec_vsx_ld( 
i,v_x) ; - register __vector float vx_1 = vec_vsx_ld(i2, v_x); - - register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); - register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); - - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va1 = vec_vsx_ld(i, vptr_a1); - register __vector float va2 = vec_vsx_ld(i ,vptr_a2); - register __vector float va3 = vec_vsx_ld(i ,vptr_a3); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); - register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); - - - vtemp0_p += vx_0*va0 + vx_1*va0_1 ; - vtemp0_r += vxr_0*va0 + vxr_1*va0_1; - vtemp1_p += vx_0*va1 + vx_1*va1_1; - vtemp1_r += vxr_0*va1 + vxr_1*va1_1; - vtemp2_p += vx_0*va2 + vx_1*va2_1; - vtemp2_r += vxr_0*va2 + vxr_1*va2_1; - vtemp3_p += vx_0*va3 + vx_1*va3_1; - vtemp3_r += vxr_0*va3 + vxr_1*va3_1; - - } - -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; - - register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1] + vtemp2_p[2] - vtemp2_p[3]; - register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1] + vtemp2_r[2] + vtemp2_r[3]; - - register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1] + vtemp3_p[2] - vtemp3_p[3]; - register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1] + vtemp3_r[2] + vtemp3_r[3]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; - - register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1] + vtemp2_p[2] + vtemp2_p[3]; - register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1] + vtemp2_r[2] - vtemp2_r[3]; - - register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1] + vtemp3_p[2] + vtemp3_p[3]; - register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1] + vtemp3_r[2] - vtemp3_r[3]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; - y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; - y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; - -#else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; - y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; - y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; - -#endif - -} - - -static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); - //p for positive(real*real,image*image,real*real,image*image) 
r for image (real*image,image*real,real*image,image*real) - register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; - - - __vector float* vptr_a0 = (__vector float*) a0; - __vector float* vptr_a1 = (__vector float*) a1; - __vector float* v_x = (__vector float*) x; - - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vx_0 = vec_vsx_ld( i,v_x) ; - register __vector float vx_1 = vec_vsx_ld(i2, v_x); - - register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); - register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); - - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va1 = vec_vsx_ld(i, vptr_a1); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); - - - vtemp0_p += vx_0*va0 + vx_1*va0_1 ; - vtemp0_r += vxr_0*va0 + vxr_1*va0_1; - vtemp1_p += vx_0*va1 + vx_1*va1_1; - vtemp1_r += vxr_0*va1 + vxr_1*va1_1; - - } -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; - - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - -#else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - -#endif - -} - - -static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); - //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) - register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; - __vector float* vptr_a0 = (__vector float*) ap; - __vector float* v_x = (__vector float*) x; - BLASLONG i = 0; - BLASLONG i2 = 16; - for (;i< n * 8; i+=32, i2+=32) { - register __vector float vx_0 = vec_vsx_ld( i,v_x) ; - register __vector float vx_1 = vec_vsx_ld(i2, v_x); - - register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); - register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); - - register __vector float va0 = vec_vsx_ld(i,vptr_a0); - register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); - - vtemp0_p += vx_0*va0 + vx_1*va0_1 ; - vtemp0_r += vxr_0*va0 + vxr_1*va0_1; - } - -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - 
vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - -#else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - -#endif - - -} - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest = *src; - *(dest + 1) = *(src + 1); - dest += 2; - src += inc_src; - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i=0; - BLASLONG j=0; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[8] __attribute__((aligned(16))); - FLOAT *xbuffer; - - if (m < 1) return (0); - if (n < 1) return (0); - - inc_x <<= 1; - inc_y <<= 1; - lda <<= 1; - - xbuffer = buffer; - - n1 = n >> 2; - n2 = n & 3; - - m3 = m & 3; - m1 = m - m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if (inc_x != 2) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - if (inc_y == 2) { - - for (i = 0; i < n1; i++) { - cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda << 2; - y_ptr += 8; - - } - - if (n2 & 2) { - cgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda << 1; - y_ptr += 4; - - } - - if (n2 & 1) { - cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda; - y_ptr += 2; - - } - - } else { - - for (i = 0; i < n1; i++) { - memset(ybuffer, 0, sizeof (ybuffer)); - cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); - - a_ptr += lda << 2; - - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[2]; - y_ptr[1] += ybuffer[3]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[4]; - y_ptr[1] += ybuffer[5]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[6]; - y_ptr[1] += ybuffer[7]; - y_ptr += inc_y; - - } - - for (i = 0; i < n2; i++) { - memset(ybuffer, 0, sizeof (ybuffer)); - cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); - a_ptr += lda; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - - } - - } - a += 2 * NB; - x += NB * inc_x; - } - - if (m3 == 0) return (0); - - x_ptr = x; - j = 0; - a_ptr = a; - y_ptr = y; - - if (m3 == 3) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x4 = x_ptr[0]; - FLOAT x5 = x_ptr[1]; - while (j < n) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += 
a_ptr[2] * x3 - a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); - } - - if (m3 == 2) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - - while (j < (n & -2)) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } - - while (j < n) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j++; - } - - return (0); - } - - if (m3 == 1) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - - while (j < (n & -2)) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; - y_ptr += 
inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } - - while (j < n) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); - } - - return (0); - -} -#endif +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
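
The transposed cgemv kernels that follow accumulate, per column of A, one "p" sum from the unswapped x (real·real and imag·imag products) and one "r" sum from the swap-masked x (real·imag and imag·real products), and apply the CONJ/XCONJ signs only in the final horizontal reduction; the scalar tail paths in the same file apply those signs directly. The sketch below restates one column in scalar C for the default sign case (no CONJ, no XCONJ); `cgemv_t_column_sketch` is an illustrative name, not an OpenBLAS function.

```c
#include <stddef.h>

/* One column of the transposed complex GEMV in scalar form: the dot
 * product of a column of A with x, then the alpha update of one complex
 * element of y.  Storage is interleaved: element k is (v[2k], v[2k+1]).
 * Default sign convention (neither CONJ nor XCONJ defined). */
void cgemv_t_column_sketch(size_t n, const float *a_col, const float *x,
                           float *y, float alpha_r, float alpha_i)
{
    float temp_r = 0.0f, temp_i = 0.0f;
    for (size_t k = 0; k < n; k++) {
        float ar = a_col[2 * k], ai = a_col[2 * k + 1];
        float xr = x[2 * k],     xi = x[2 * k + 1];
        temp_r += ar * xr - ai * xi;   /* signs applied per element here;  */
        temp_i += ar * xi + ai * xr;   /* the VSX code defers them to the  */
    }                                  /* horizontal reduction             */
    /* y += alpha * temp, complex multiply-accumulate */
    y[0] += alpha_r * temp_r - alpha_i * temp_i;
    y[1] += alpha_r * temp_i + alpha_i * temp_r;
}
```
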
+*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zgemv_t.c" +#else + +#include "common.h" + +#define NBMAX 1024 +#include +static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; + +static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp2_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0}; + __vector float* vptr_a0 = (__vector float*) a0; + __vector float* vptr_a1 = (__vector float*) a1; + __vector float* vptr_a2 = (__vector float*) a2; + __vector float* vptr_a3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vx_0 = vec_vsx_ld( i,v_x) ; + register __vector float vx_1 = vec_vsx_ld(i2, v_x); + + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va2 = vec_vsx_ld(i ,vptr_a2); + register __vector float va3 = vec_vsx_ld(i ,vptr_a3); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2); + register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3); + + + vtemp0_p += vx_0*va0 + vx_1*va0_1 ; + vtemp0_r += vxr_0*va0 + vxr_1*va0_1; + vtemp1_p += vx_0*va1 + vx_1*va1_1; + vtemp1_r += vxr_0*va1 + vxr_1*va1_1; + vtemp2_p += vx_0*va2 + vx_1*va2_1; + vtemp2_r += vxr_0*va2 + vxr_1*va2_1; + vtemp3_p += vx_0*va3 + vx_1*va3_1; + vtemp3_r += vxr_0*va3 + vxr_1*va3_1; + + } + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; + + register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1] + vtemp2_p[2] - vtemp2_p[3]; + register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1] + vtemp2_r[2] + vtemp2_r[3]; + + register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1] + vtemp3_p[2] - vtemp3_p[3]; + register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1] + vtemp3_r[2] + vtemp3_r[3]; + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; + 
register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; + + register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1] + vtemp2_p[2] + vtemp2_p[3]; + register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1] + vtemp2_r[2] - vtemp2_r[3]; + + register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1] + vtemp3_p[2] + vtemp3_p[3]; + register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1] + vtemp3_r[2] - vtemp3_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; + y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; + y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; + y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; + +#endif + +} + + +static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; + + + __vector float* vptr_a0 = (__vector float*) a0; + __vector float* vptr_a1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vx_0 = vec_vsx_ld( i,v_x) ; + register __vector float vx_1 = vec_vsx_ld(i2, v_x); + + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va1 = vec_vsx_ld(i, vptr_a1); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); + + + vtemp0_p += vx_0*va0 + vx_1*va0_1 ; + vtemp0_r += vxr_0*va0 + vxr_1*va0_1; + vtemp1_p += vx_0*va1 + vx_1*va1_1; + vtemp1_r += vxr_0*va1 + vxr_1*va1_1; + + } +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; + + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + + register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; + register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * 
temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + +#endif + +} + + +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + + __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); + //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) + register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; + register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; + __vector float* vptr_a0 = (__vector float*) ap; + __vector float* v_x = (__vector float*) x; + BLASLONG i = 0; + BLASLONG i2 = 16; + for (;i< n * 8; i+=32, i2+=32) { + register __vector float vx_0 = vec_vsx_ld( i,v_x) ; + register __vector float vx_1 = vec_vsx_ld(i2, v_x); + + register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); + register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); + + register __vector float va0 = vec_vsx_ld(i,vptr_a0); + register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); + + vtemp0_p += vx_0*va0 + vx_1*va0_1 ; + vtemp0_r += vxr_0*va0 + vxr_1*va0_1; + } + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + +#endif + + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest = *src; + *(dest + 1) = *(src + 1); + dest += 2; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i=0; + BLASLONG j=0; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + inc_x <<= 1; + inc_y <<= 1; + lda <<= 1; + + xbuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 2) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + if (inc_y == 2) { + + for (i = 0; i < n1; i++) { + cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda << 2; + y_ptr += 8; + + } + + if (n2 & 2) { + cgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda << 1; + y_ptr += 4; + + } + + if (n2 & 1) { + cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda; + 
y_ptr += 2; + + } + + } else { + + for (i = 0; i < n1; i++) { + memset(ybuffer, 0, sizeof (ybuffer)); + cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); + + a_ptr += lda << 2; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for (i = 0; i < n2; i++) { + memset(ybuffer, 0, sizeof (ybuffer)); + cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + if (m3 == 0) return (0); + + x_ptr = x; + j = 0; + a_ptr = a; + y_ptr = y; + + if (m3 == 3) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + if (m3 == 2) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + + while (j < (n & -2)) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + + a_ptr += lda; + 
y_ptr += inc_y; + j += 2; + } + + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + + return (0); + } + + if (m3 == 1) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + + while (j < (n & -2)) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } + + while (j < n) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + return (0); + +} +#endif diff --git a/kernel/power/crot.c b/kernel/power/crot.c index 84ba5d913..dbd7e3482 100644 --- a/kernel/power/crot.c +++ b/kernel/power/crot.c @@ -1,233 +1,233 @@ -/*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. 
Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if defined(POWER8) || defined(POWER9) || defined(POWER10) -#if defined(__VEC__) || defined(__ALTIVEC__) - -static void crot_kernel_8 (long n, float *x, float *y, float c, float s) -{ - __vector float t0; - __vector float t1; - __vector float t2; - __vector float t3; - __vector float t4; - __vector float t5; - __vector float t6; - __vector float t7; - __asm__ - ( - "xscvdpspn 36, %x[cos] \n\t" // load c to all words - "xxspltw 36, 36, 0 \n\t" - "xscvdpspn 37, %x[sin] \n\t" // load s to all words - "xxspltw 37, 37, 0 \n\t" - "lxvd2x 32, 0, %[x_ptr] \n\t" // load x - "lxvd2x 33, %[i16], %[x_ptr] \n\t" - "lxvd2x 34, %[i32], %[x_ptr] \n\t" - "lxvd2x 35, %[i48], %[x_ptr] \n\t" - "lxvd2x 48, 0, %[y_ptr] \n\t" // load y - "lxvd2x 49, %[i16], %[y_ptr] \n\t" - "lxvd2x 50, %[i32], %[y_ptr] \n\t" - "lxvd2x 51, %[i48], %[y_ptr] \n\t" - "addi %[x_ptr], %[x_ptr], 64 \n\t" - "addi %[y_ptr], %[y_ptr], 64 \n\t" - "addic. 
%[temp_n], %[temp_n], -8 \n\t" - "ble two%= \n\t" - ".align 5 \n\t" - "one%=: \n\t" - "xvmulsp 40, 32, 36 \n\t" // c * x - "xvmulsp 41, 33, 36 \n\t" - "xvmulsp 42, 34, 36 \n\t" - "xvmulsp 43, 35, 36 \n\t" - "xvmulsp %x[x0], 48, 36 \n\t" // c * y - "xvmulsp %x[x2], 49, 36 \n\t" - "xvmulsp %x[x1], 50, 36 \n\t" - "xvmulsp %x[x3], 51, 36 \n\t" - "xvmulsp 44, 32, 37 \n\t" // s * x - "xvmulsp 45, 33, 37 \n\t" - "lxvd2x 32, 0, %[x_ptr] \n\t" // load x - "lxvd2x 33, %[i16], %[x_ptr] \n\t" - "xvmulsp 46, 34, 37 \n\t" - "xvmulsp 47, 35, 37 \n\t" - "lxvd2x 34, %[i32], %[x_ptr] \n\t" - "lxvd2x 35, %[i48], %[x_ptr] \n\t" - "xvmulsp %x[x4], 48, 37 \n\t" // s * y - "xvmulsp %x[x5], 49, 37 \n\t" - "lxvd2x 48, 0, %[y_ptr] \n\t" // load y - "lxvd2x 49, %[i16], %[y_ptr] \n\t" - "xvmulsp %x[x6], 50, 37 \n\t" - "xvmulsp %x[x7], 51, 37 \n\t" - "lxvd2x 50, %[i32], %[y_ptr] \n\t" - "lxvd2x 51, %[i48], %[y_ptr] \n\t" - "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y - "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y - "addi %[x_ptr], %[x_ptr], -64 \n\t" - "addi %[y_ptr], %[y_ptr], -64 \n\t" - "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y - "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y - "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x - "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x - "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x - "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x - "stxvd2x 40, 0, %[x_ptr] \n\t" // store x - "stxvd2x 41, %[i16], %[x_ptr] \n\t" - "stxvd2x 42, %[i32], %[x_ptr] \n\t" - "stxvd2x 43, %[i48], %[x_ptr] \n\t" - "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y - "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" - "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" - "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" - "addi %[x_ptr], %[x_ptr], 128 \n\t" - "addi %[y_ptr], %[y_ptr], 128 \n\t" - "addic. 
%[temp_n], %[temp_n], -8 \n\t" - "bgt one%= \n\t" - "two%=: \n\t" - "xvmulsp 40, 32, 36 \n\t" // c * x - "xvmulsp 41, 33, 36 \n\t" - "xvmulsp 42, 34, 36 \n\t" - "xvmulsp 43, 35, 36 \n\t" - "xvmulsp %x[x0], 48, 36 \n\t" // c * y - "xvmulsp %x[x2], 49, 36 \n\t" - "xvmulsp %x[x1], 50, 36 \n\t" - "xvmulsp %x[x3], 51, 36 \n\t" - "xvmulsp 44, 32, 37 \n\t" // s * x - "xvmulsp 45, 33, 37 \n\t" - "xvmulsp 46, 34, 37 \n\t" - "xvmulsp 47, 35, 37 \n\t" - "xvmulsp %x[x4], 48, 37 \n\t" // s * y - "xvmulsp %x[x5], 49, 37 \n\t" - "xvmulsp %x[x6], 50, 37 \n\t" - "xvmulsp %x[x7], 51, 37 \n\t" - "addi %[x_ptr], %[x_ptr], -64 \n\t" - "addi %[y_ptr], %[y_ptr], -64 \n\t" - "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y - "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y - "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y - "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y - "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x - "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x - "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x - "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x - "stxvd2x 40, 0, %[x_ptr] \n\t" // store x - "stxvd2x 41, %[i16], %[x_ptr] \n\t" - "stxvd2x 42, %[i32], %[x_ptr] \n\t" - "stxvd2x 43, %[i48], %[x_ptr] \n\t" - "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y - "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" - "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" - "stxvd2x %x[x3], %[i48], %[y_ptr] " - : - [mem_x] "+m" (*(float (*)[2*n])x), - [mem_y] "+m" (*(float (*)[2*n])y), - [temp_n] "+r" (n), - [x_ptr] "+&b" (x), - [y_ptr] "+&b" (y), - [x0] "=wa" (t0), - [x1] "=wa" (t2), - [x2] "=wa" (t1), - [x3] "=wa" (t3), - [x4] "=wa" (t4), - [x5] "=wa" (t5), - [x6] "=wa" (t6), - [x7] "=wa" (t7) - : - [cos] "f" (c), - [sin] "f" (s), - [i16] "b" (16), - [i32] "b" (32), - [i48] "b" (48) - : - "cr0", - "vs32","vs33","vs34","vs35","vs36","vs37", - "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", - "vs48","vs49","vs50","vs51" - ); -} - -#endif -#endif - - -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1) ) - { -#if defined(__VEC__) || defined(__ALTIVEC__) - BLASLONG n1 = n & -8; - if ( n1 > 0 ) - { - crot_kernel_8(n1, x, y, c, s); - i=n1; - ix=2*n1; - } -#endif - while(i < n) - { - temp[0] = c*x[ix] + s*y[ix] ; - temp[1] = c*x[ix+1] + s*y[ix+1] ; - y[ix] = c*y[ix] - s*x[ix] ; - y[ix+1] = c*y[ix+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += 2 ; - i++ ; - - } - - } - else - { - inc_x2 = 2 * inc_x ; - inc_y2 = 2 * inc_y ; - while(i < n) - { - temp[0] = c*x[ix] + s*y[iy] ; - temp[1] = c*x[ix+1] + s*y[iy+1] ; - y[iy] = c*y[iy] - s*x[ix] ; - y[iy+1] = c*y[iy+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - - } - } - return(0); -} - +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) + +static void crot_kernel_8 (long n, float *x, float *y, float c, float s) +{ + __vector float t0; + __vector float t1; + __vector float t2; + __vector float t3; + __vector float t4; + __vector float t5; + __vector float t6; + __vector float t7; + __asm__ + ( + "xscvdpspn 36, %x[cos] \n\t" // load c to all words + "xxspltw 36, 36, 0 \n\t" + "xscvdpspn 37, %x[sin] \n\t" // load s to all words + "xxspltw 37, 37, 0 \n\t" + "lxvd2x 32, 0, %[x_ptr] \n\t" // load x + "lxvd2x 33, %[i16], %[x_ptr] \n\t" + "lxvd2x 34, %[i32], %[x_ptr] \n\t" + "lxvd2x 35, %[i48], %[x_ptr] \n\t" + "lxvd2x 48, 0, %[y_ptr] \n\t" // load y + "lxvd2x 49, %[i16], %[y_ptr] \n\t" + "lxvd2x 50, %[i32], %[y_ptr] \n\t" + "lxvd2x 51, %[i48], %[y_ptr] \n\t" + "addi %[x_ptr], %[x_ptr], 64 \n\t" + "addi %[y_ptr], %[y_ptr], 64 \n\t" + "addic. 
%[temp_n], %[temp_n], -8 \n\t" + "ble two%= \n\t" + ".align 5 \n\t" + "one%=: \n\t" + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + "xvmulsp %x[x0], 48, 36 \n\t" // c * y + "xvmulsp %x[x2], 49, 36 \n\t" + "xvmulsp %x[x1], 50, 36 \n\t" + "xvmulsp %x[x3], 51, 36 \n\t" + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "lxvd2x 32, 0, %[x_ptr] \n\t" // load x + "lxvd2x 33, %[i16], %[x_ptr] \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + "lxvd2x 34, %[i32], %[x_ptr] \n\t" + "lxvd2x 35, %[i48], %[x_ptr] \n\t" + "xvmulsp %x[x4], 48, 37 \n\t" // s * y + "xvmulsp %x[x5], 49, 37 \n\t" + "lxvd2x 48, 0, %[y_ptr] \n\t" // load y + "lxvd2x 49, %[i16], %[y_ptr] \n\t" + "xvmulsp %x[x6], 50, 37 \n\t" + "xvmulsp %x[x7], 51, 37 \n\t" + "lxvd2x 50, %[i32], %[y_ptr] \n\t" + "lxvd2x 51, %[i48], %[y_ptr] \n\t" + "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y + "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y + "addi %[x_ptr], %[x_ptr], -64 \n\t" + "addi %[y_ptr], %[y_ptr], -64 \n\t" + "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y + "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y + "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x + "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x + "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x + "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x + "stxvd2x 40, 0, %[x_ptr] \n\t" // store x + "stxvd2x 41, %[i16], %[x_ptr] \n\t" + "stxvd2x 42, %[i32], %[x_ptr] \n\t" + "stxvd2x 43, %[i48], %[x_ptr] \n\t" + "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y + "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" + "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" + "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" + "addi %[x_ptr], %[x_ptr], 128 \n\t" + "addi %[y_ptr], %[y_ptr], 128 \n\t" + "addic. 
%[temp_n], %[temp_n], -8 \n\t" + "bgt one%= \n\t" + "two%=: \n\t" + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + "xvmulsp %x[x0], 48, 36 \n\t" // c * y + "xvmulsp %x[x2], 49, 36 \n\t" + "xvmulsp %x[x1], 50, 36 \n\t" + "xvmulsp %x[x3], 51, 36 \n\t" + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + "xvmulsp %x[x4], 48, 37 \n\t" // s * y + "xvmulsp %x[x5], 49, 37 \n\t" + "xvmulsp %x[x6], 50, 37 \n\t" + "xvmulsp %x[x7], 51, 37 \n\t" + "addi %[x_ptr], %[x_ptr], -64 \n\t" + "addi %[y_ptr], %[y_ptr], -64 \n\t" + "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y + "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y + "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y + "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y + "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x + "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x + "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x + "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x + "stxvd2x 40, 0, %[x_ptr] \n\t" // store x + "stxvd2x 41, %[i16], %[x_ptr] \n\t" + "stxvd2x 42, %[i32], %[x_ptr] \n\t" + "stxvd2x 43, %[i48], %[x_ptr] \n\t" + "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y + "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" + "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" + "stxvd2x %x[x3], %[i48], %[y_ptr] " + : + [mem_x] "+m" (*(float (*)[2*n])x), + [mem_y] "+m" (*(float (*)[2*n])y), + [temp_n] "+r" (n), + [x_ptr] "+&b" (x), + [y_ptr] "+&b" (y), + [x0] "=wa" (t0), + [x1] "=wa" (t2), + [x2] "=wa" (t1), + [x3] "=wa" (t3), + [x4] "=wa" (t4), + [x5] "=wa" (t5), + [x6] "=wa" (t6), + [x7] "=wa" (t7) + : + [cos] "f" (c), + [sin] "f" (s), + [i16] "b" (16), + [i32] "b" (32), + [i48] "b" (48) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); +} + +#endif +#endif + + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { +#if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -8; + if ( n1 > 0 ) + { + crot_kernel_8(n1, x, y, c, s); + i=n1; + ix=2*n1; + } +#endif + while(i < n) + { + temp[0] = c*x[ix] + s*y[ix] ; + temp[1] = c*x[ix+1] + s*y[ix+1] ; + y[ix] = c*y[ix] - s*x[ix] ; + y[ix+1] = c*y[ix+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += 2 ; + i++ ; + + } + + } + else + { + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + } + return(0); +} + diff --git a/kernel/power/dgemm_kernel_power9.S b/kernel/power/dgemm_kernel_power9.S index 2fb1b27ef..86108f20c 100644 --- a/kernel/power/dgemm_kernel_power9.S +++ b/kernel/power/dgemm_kernel_power9.S @@ -1,249 +1,249 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. 
Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - - -#define LOAD ld - - - - -#define STACKSIZE (512 ) -#define ALPHA_SP (296+192)(SP) -#define FZERO (304+192)(SP) - - - -#define M r3 -#define N r4 -#define K r5 - -#define A r7 -#define B r8 -#define C r9 -#define LDC r10 -#define OFFSET r6 - - - -#define alpha_r vs18 - -#define o0 0 - - -#define T4 r12 -#define T3 r11 -#define C4 r14 -#define o8 r15 -#define o24 r16 -#define C2 r17 -#define L r18 -#define T1 r19 -#define C3 r20 -#define TEMP_REG r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define o16 r27 -#define o32 r28 -#define o48 r29 - -#define PRE r30 -#define T2 r31 - -#include "dgemm_macros_power9.S" - - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - addi SP, SP, -STACKSIZE - li r0, 0 - - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - - - stfd f1, ALPHA_SP - stw r0, FZERO - - slwi LDC, LDC, BASE_SHIFT - -#if defined(TRMMKERNEL) - ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) -#endif - - - cmpwi cr0, M, 0 - ble .L999_H1 - cmpwi cr0, N, 0 - ble .L999_H1 - cmpwi cr0, K, 0 - ble .L999_H1 - - - - addi T1, SP, 296+192 - - - li PRE, 384 - li o8 , 8 - li o16, 16 - li o24, 24 - li o32, 32 - li o48, 48 - - - lxvdsx alpha_r, 0, T1 - -#include 
"dgemm_logic_power9.S" - -.L999: - addi r3, 0, 0 - - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - EPILOGUE -#endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld + + + + +#define STACKSIZE (512 ) +#define ALPHA_SP (296+192)(SP) +#define FZERO (304+192)(SP) + + + +#define M r3 +#define N r4 +#define K r5 + +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 + + + +#define alpha_r vs18 + +#define o0 0 + + +#define T4 r12 +#define T3 r11 +#define C4 r14 +#define o8 r15 +#define o24 r16 +#define C2 r17 +#define L r18 +#define T1 r19 +#define C3 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T2 r31 + +#include "dgemm_macros_power9.S" + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + + + stfd f1, ALPHA_SP + stw r0, FZERO + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + + + cmpwi cr0, M, 0 + ble .L999_H1 + cmpwi cr0, N, 0 + ble .L999_H1 + cmpwi cr0, K, 0 + ble .L999_H1 + + + + addi T1, SP, 296+192 + + + li PRE, 384 + li o8 , 8 + li o16, 16 + li o24, 24 + li o32, 32 + li o48, 48 + + + lxvdsx alpha_r, 0, T1 + +#include "dgemm_logic_power9.S" + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/dgemm_logic_power9.S b/kernel/power/dgemm_logic_power9.S index 251839d19..a48bc685a 100644 --- a/kernel/power/dgemm_logic_power9.S +++ b/kernel/power/dgemm_logic_power9.S @@ -1,1981 +1,1981 @@ -/*************************************************************************** 
-Copyright (c) 2013-2019 The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -#define MY_ALIGN .align 3 - -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - - srawi. J, N, 2 - ble LDGEMM_L4_END - -LDGEMM_L4_BEGIN: - - - li T1, 128 - li T2, 256 - - mr AO, A - mr CO, C - slwi T3, LDC , 2 - add C, C, T3 - - - dcbt A, T1 - dcbt A, T2 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LDGEMM_L4x16_END - - MY_ALIGN -LDGEMM_L4x16_BEGIN: - - li L, -128 - - - SAVE4x16_REGS - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 -#else - mr BO, B -#endif - - and T1, CO, L - and T2, C2, L - and T3, C3, L - and T4, C4, L - - dcbt T1, r0 - dcbt T2, r0 - dcbt T3, r0 - dcbt T4, r0 - - - addi T1, T1, 128 - addi T2, T2, 128 - addi T3, T3, 128 - addi T4, T4, 128 - - dcbt T1, r0 - dcbt T2, r0 - dcbt T3, r0 - dcbt T4, r0 - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T3,K,TEMP_REG,16,4 - srawi. L, T3, 5 -#else - srawi. L, K, 5 -#endif - - ble LDGEMM_L4x16_SUB0 - - - MY_ALIGN -LDGEMM_L4x16_LOOP_START: - - li T2, 512 - - - LOAD4x16_1 - ##OffsetA=128 OffsetB=32 - addi AO,AO,2176 - # addi BO,BO,32 - addic. 
L, L, -1 - - ble LDGEMM_L4x16_LOOP_END - - - mtctr L - - MY_ALIGN - -LDGEMM_L4x16_LOOP: - - #dcbt AO, PRE - KERNEL4x16_I1_L2_2 -2048,32, 0,0 - KERNEL4x16_I1_L2_2 -2048,32, 1,0 - KERNEL4x16_I1_L2_2 -2048,32, 2,0 - KERNEL4x16_I1_L2_2 -2048,32, 3,0 - KERNEL4x16_I1_L2_2 -2048,32, 4,0 - KERNEL4x16_I1_L2_2 -2048,32, 5,0 - KERNEL4x16_I1_L2_2 -2048,32, 6,0 - KERNEL4x16_I1_L2_2 -2048,32, 7,0 - KERNEL4x16_I1_L2_2 -2048,32, 8,0 - KERNEL4x16_I1_L2_2 -2048,32, 9,0 - KERNEL4x16_I1_L2_2 -2048,32, 10,0 - KERNEL4x16_I1_L2_2 -2048,32, 11,0 - KERNEL4x16_I1_L2_2 -2048,32, 12,0 - KERNEL4x16_I1_L2_2 -2048,32, 13,0 - KERNEL4x16_I1_L2_2 -2048,32, 14,0 - KERNEL4x16_I1_L2_2 -2048,32, 15,1 - - - bdnz LDGEMM_L4x16_LOOP - - MY_ALIGN - MY_ALIGN -LDGEMM_L4x16_LOOP_END: - - KERNEL4x16_I1_L2_2 -2048,32, 0,0 - KERNEL4x16_I1_L2_2 -2048,32, 1,0 - KERNEL4x16_I1_L2_2 -2048,32, 2,0 - KERNEL4x16_I1_L2_2 -2048,32, 3,0 - KERNEL4x16_I1_L2_2 -2048,32, 4,0 - KERNEL4x16_I1_L2_2 -2048,32, 5,0 - KERNEL4x16_I1_L2_2 -2048,32, 6,0 - KERNEL4x16_I1_L2_2 -2048,32, 7,0 - KERNEL4x16_I1_L2_2 -2048,32, 8,0 - KERNEL4x16_I1_L2_2 -2048,32, 9,0 - KERNEL4x16_I1_L2_2 -2048,32, 10,0 - KERNEL4x16_I1_L2_2 -2048,32, 11,0 - KERNEL4x16_I1_L2_2 -2048,32, 12,0 - KERNEL4x16_I1_L2_2 -2048,32, 13,0 - KERNEL4x16_I1_L2_2 -2048,32, 14,0 - KERNEL4x16_I1_L2_3 -2048,32, 15,1 - b LDGEMM_L4x16_SUB1 - - - MY_ALIGN -LDGEMM_L4x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 31 -#else - andi. L, K, 31 -#endif - KERNEL4x16 1 - - addic. L, L, -1 - ble LDGEMM_L4x16_SAVE - b LDGEMM_L4x16_SUB2 - MY_ALIGN -LDGEMM_L4x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 31 -#else - andi. L, K, 31 -#endif - ble LDGEMM_L4x16_SAVE - MY_ALIGN -LDGEMM_L4x16_SUB2: - - andi. T1,L, 16 - ble LDGEMM_L4x16_SUB2_8 - LOAD4x16_0 - KERNEL4x16_I1_L2_2 128,32, 0,0 - KERNEL4x16_I1_L2_2 128,32, 1,0 - KERNEL4x16_I1_L2_2 128,32, 2,0 - KERNEL4x16_I1_L2_2 128,32, 3,0 - KERNEL4x16_I1_L2_2 128,32, 4,0 - KERNEL4x16_I1_L2_2 128,32, 5,0 - KERNEL4x16_I1_L2_2 128,32, 6,0 - KERNEL4x16_I1_L2_3 128,32, 7,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_8: - andi. T1,L, 8 - ble LDGEMM_L4x16_SUB2_4 - LOAD4x16_0 - KERNEL4x16_I1_L2_2 128,32, 0,0 - KERNEL4x16_I1_L2_2 128,32, 1,0 - KERNEL4x16_I1_L2_2 128,32, 2,0 - KERNEL4x16_I1_L2_3 128,32, 3,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_4: - andi. T1,L, 4 - ble LDGEMM_L4x16_SUB2_2 - LOAD4x16_0 - KERNEL4x16_I1_L2_2 128,32, 0,0 - KERNEL4x16_I1_L2_3 128,32, 1,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_2: - andi. T1,L, 2 - ble LDGEMM_L4x16_SUB2_1 - LOAD4x16_0 - KERNEL4x16_I1_L2_3 128,32, 0,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_1: - andi. T1,L, 1 - ble LDGEMM_L4x16_SAVE - KERNEL4x16 0 -# addic. L, L, -1 -# bgt LDGEMM_L4x16_SUB2 - - MY_ALIGN -LDGEMM_L4x16_SAVE: - SAVE4x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,4 -#endif - addic. I, I, -1 - bgt+ LDGEMM_L4x16_BEGIN - -LDGEMM_L4x16_END: - -LDGEMM_L4x8_BEGIN: - - andi. T2, M, 15 - ble LDGEMM_L4x1_END - - andi. T1, M, 8 - ble LDGEMM_L4x8_END - - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,8,4 - srawi. L, T3, 4 -#else - mr BO, B - srawi. L, K, 4 -#endif - - - ble LDGEMM_L4x8_SUB0 - -LDGEMM_L4x8_LOOP_START: - - - LOAD4x8_1 - ##OffsetA=64 OffsetB=32 - - - addic. 
L, L, -1 - - ble LDGEMM_L4x8_LOOP_END - - mtctr L - MY_ALIGN - -LDGEMM_L4x8_LOOP: - - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_2 64,32, 1,0 - KERNEL4x8_I1_L2_2 64,32, 2,0 - KERNEL4x8_I1_L2_2 64,32, 3,0 - KERNEL4x8_I1_L2_2 64,32, 4,0 - KERNEL4x8_I1_L2_2 64,32, 5,0 - KERNEL4x8_I1_L2_2 64,32, 6,0 - KERNEL4x8_I1_L2_2 64,32, 7,1 - - bdnz LDGEMM_L4x8_LOOP - MY_ALIGN -LDGEMM_L4x8_LOOP_END: - - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_2 64,32, 1,0 - KERNEL4x8_I1_L2_2 64,32, 2,0 - KERNEL4x8_I1_L2_2 64,32, 3,0 - KERNEL4x8_I1_L2_2 64,32, 4,0 - KERNEL4x8_I1_L2_2 64,32, 5,0 - KERNEL4x8_I1_L2_2 64,32, 6,0 - KERNEL4x8_I1_L2_3 64,32, 7,1 - - b LDGEMM_L4x8_SUB1 - MY_ALIGN -LDGEMM_L4x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 15 -#else - andi. L, K, 15 -#endif - KERNEL4x8 1 - - addic. L, L, -1 - ble LDGEMM_L4x8_SAVE - b LDGEMM_L4x8_SUB2 - MY_ALIGN -LDGEMM_L4x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 15 -#else - andi. L, K, 15 -#endif - ble LDGEMM_L4x8_SAVE - MY_ALIGN -LDGEMM_L4x8_SUB2: - - andi. T1,L, 8 - ble LDGEMM_L4x8_SUB2_4 - LOAD4x8_0 - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_2 64,32, 1,0 - KERNEL4x8_I1_L2_2 64,32, 2,0 - KERNEL4x8_I1_L2_3 64,32, 3,1 - MY_ALIGN -LDGEMM_L4x8_SUB2_4: - andi. T1,L, 4 - ble LDGEMM_L4x8_SUB2_2 - LOAD4x8_0 - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_3 64,32, 1,1 - MY_ALIGN -LDGEMM_L4x8_SUB2_2: - andi. T1,L, 2 - ble LDGEMM_L4x8_SUB2_1 - LOAD4x8_0 - KERNEL4x8_I1_L2_3 64,32, 0,1 - MY_ALIGN -LDGEMM_L4x8_SUB2_1: - andi. T1,L, 1 - ble LDGEMM_L4x8_SAVE - KERNEL4x8 0 - - MY_ALIGN -LDGEMM_L4x8_SAVE: - SAVE4x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,4 -#endif -LDGEMM_L4x8_END: - -LDGEMM_L4x4_BEGIN: - - - andi. T1, M, 4 - ble LDGEMM_L4x4_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,4,4 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L4x4_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L4x4_SUB4 - -LDGEMM_L4x4_LOOP_START: - - #dcbt AO, PRE - LOAD4x4_1 - KERNEL4x4_I1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - addic. L, L, -2 - ble LDGEMM_L4x4_LOOP_END - - MY_ALIGN - -LDGEMM_L4x4_LOOP: - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - addic. L, L, -1 - bgt LDGEMM_L4x4_LOOP - -LDGEMM_L4x4_LOOP_END: - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - KERNEL4x4_2 - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - KERNEL4x4_E2 - - b LDGEMM_L4x4_SUB1 - -LDGEMM_L4x4_SUB4: - - KERNEL4x4_SUBI1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - - b LDGEMM_L4x4_SUB1 - -LDGEMM_L4x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL4x4_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L4x4_SAVE - b LDGEMM_L4x4_SUB2 - -LDGEMM_L4x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L4x4_SAVE - -LDGEMM_L4x4_SUB2: - - KERNEL4x4_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L4x4_SUB2 - -LDGEMM_L4x4_SAVE: - - SAVE4x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,4 -#endif -LDGEMM_L4x4_END: - -LDGEMM_L4x2_BEGIN: - - - andi. T1, M, 2 - ble LDGEMM_L4x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,2,4 - srawi. L, T3, 3 -#else - mr BO, B - srawi. 
L, K, 3 -#endif - ble LDGEMM_L4x2_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L4x2_SUB4 - -LDGEMM_L4x2_LOOP_START: - - LOAD4x2_1 - KERNEL4x2_I1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - addic. L, L, -2 - ble LDGEMM_L4x2_LOOP_END - - MY_ALIGN - -LDGEMM_L4x2_LOOP: - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - addic. L, L, -1 - bgt LDGEMM_L4x2_LOOP - -LDGEMM_L4x2_LOOP_END: - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_E2 - - b LDGEMM_L4x2_SUB1 - -LDGEMM_L4x2_SUB4: - - KERNEL4x2_SUBI1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - - b LDGEMM_L4x2_SUB1 - -LDGEMM_L4x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL4x2_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L4x2_SAVE - b LDGEMM_L4x2_SUB2 - -LDGEMM_L4x2_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L4x2_SAVE - -LDGEMM_L4x2_SUB2: - - KERNEL4x2_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L4x2_SUB2 - -LDGEMM_L4x2_SAVE: - - SAVE4x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,4 -#endif -LDGEMM_L4x2_END: - -LDGEMM_L4x1_BEGIN: - - - andi. T1, M, 1 - ble LDGEMM_L4x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,1,4 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L4x1_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L4x1_SUB4 - -LDGEMM_L4x1_LOOP_START: - - LOAD4x1_1 - KERNEL4x1_I1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - addic. L, L, -2 - ble LDGEMM_L4x1_LOOP_END - - MY_ALIGN - -LDGEMM_L4x1_LOOP: - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - addic. L, L, -1 - bgt LDGEMM_L4x1_LOOP - -LDGEMM_L4x1_LOOP_END: - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_E2 - - b LDGEMM_L4x1_SUB1 - -LDGEMM_L4x1_SUB4: - - KERNEL4x1_SUBI1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - - b LDGEMM_L4x1_SUB1 - -LDGEMM_L4x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL4x1_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L4x1_SAVE - b LDGEMM_L4x1_SUB2 - -LDGEMM_L4x1_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L4x1_SAVE - -LDGEMM_L4x1_SUB2: - - KERNEL4x1_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L4x1_SUB2 - -LDGEMM_L4x1_SAVE: - - SAVE4x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,4 -#endif -LDGEMM_L4x1_END: - - slwi T1, K, 5 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 4 -#endif - addic. J, J, -1 - bgt LDGEMM_L4_BEGIN - - andi. T2, N, 3 - ble .L999 - -LDGEMM_L4_END: - - b LDGEMM_L2_BEGIN - -.L999_H1: - - b .L999 - -LDGEMM_L2_BEGIN: - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - andi. T1, N, 2 - ble LDGEMM_L2_END - mr CO, C - mr AO, A - slwi T1, LDC , 1 - add C, C, T1 - srawi. I, M, 4 - ble LDGEMM_L2x16_END - -LDGEMM_L2x16_BEGIN: - - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,16,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. 
L, K, 3 -#endif - ble LDGEMM_L2x16_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x16_SUB4 - -LDGEMM_L2x16_LOOP_START: - - #dcbt AO, PRE - LOAD2x16_1 - #dcbt AO, PRE - KERNEL2x16_I1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - addic. L, L, -2 - ble LDGEMM_L2x16_LOOP_END - - MY_ALIGN - -LDGEMM_L2x16_LOOP: - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - addic. L, L, -1 - bgt LDGEMM_L2x16_LOOP - -LDGEMM_L2x16_LOOP_END: - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - KERNEL2x16_E2 - - b LDGEMM_L2x16_SUB1 - -LDGEMM_L2x16_SUB4: - - #dcbt AO, PRE - KERNEL2x16_SUBI1 - #dcbt AO, PRE - KERNEL2x16_SUB1 - #dcbt AO, PRE - KERNEL2x16_SUB1 - #dcbt AO, PRE - KERNEL2x16_SUB1 - - KERNEL2x16_SUB1 - KERNEL2x16_SUB1 - KERNEL2x16_SUB1 - KERNEL2x16_SUB1 - - b LDGEMM_L2x16_SUB1 - -LDGEMM_L2x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x16_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x16_SAVE - b LDGEMM_L2x16_SUB2 - -LDGEMM_L2x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x16_SAVE - -LDGEMM_L2x16_SUB2: - - KERNEL2x16_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x16_SUB2 - -LDGEMM_L2x16_SAVE: - - SAVE2x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,2 -#endif - addic. I, I, -1 - bgt LDGEMM_L2x16_BEGIN - -LDGEMM_L2x16_END: - -LDGEMM_L2x8_BEGIN: - - andi. T2, M, 15 - ble LDGEMM_L2x1_END - - andi. T1, M, 8 - ble LDGEMM_L2x8_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,8,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L2x8_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x8_SUB4 - -LDGEMM_L2x8_LOOP_START: - - #dcbt AO, PRE - LOAD2x8_1 - KERNEL2x8_I1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - addic. L, L, -2 - ble LDGEMM_L2x8_LOOP_END - - MY_ALIGN - -LDGEMM_L2x8_LOOP: - - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - addic. L, L, -1 - bgt LDGEMM_L2x8_LOOP - -LDGEMM_L2x8_LOOP_END: - - KERNEL2x8_1 - KERNEL2x8_2 - KERNEL2x8_1 - KERNEL2x8_2 - - KERNEL2x8_1 - KERNEL2x8_2 - KERNEL2x8_1 - KERNEL2x8_E2 - - b LDGEMM_L2x8_SUB1 - -LDGEMM_L2x8_SUB4: - - KERNEL2x8_SUBI1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - - b LDGEMM_L2x8_SUB1 - -LDGEMM_L2x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x8_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x8_SAVE - b LDGEMM_L2x8_SUB2 - -LDGEMM_L2x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x8_SAVE - -LDGEMM_L2x8_SUB2: - - KERNEL2x8_SUB1 - - addic. 
L, L, -1 - bgt LDGEMM_L2x8_SUB2 - -LDGEMM_L2x8_SAVE: - - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,2 -#endif -LDGEMM_L2x8_END: - -LDGEMM_L2x4_BEGIN: - - - andi. T1, M, 4 - ble LDGEMM_L2x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,4,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L2x4_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x4_SUB4 - -LDGEMM_L2x4_LOOP_START: - - LOAD2x4_1 - KERNEL2x4_I1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - addic. L, L, -2 - ble LDGEMM_L2x4_LOOP_END - - MY_ALIGN - -LDGEMM_L2x4_LOOP: - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - addic. L, L, -1 - bgt LDGEMM_L2x4_LOOP - -LDGEMM_L2x4_LOOP_END: - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_E2 - - b LDGEMM_L2x4_SUB1 - -LDGEMM_L2x4_SUB4: - - KERNEL2x4_SUBI1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - - b LDGEMM_L2x4_SUB1 - -LDGEMM_L2x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x4_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x4_SAVE - b LDGEMM_L2x4_SUB2 - -LDGEMM_L2x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x4_SAVE - -LDGEMM_L2x4_SUB2: - - KERNEL2x4_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x4_SUB2 - -LDGEMM_L2x4_SAVE: - - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,2 -#endif -LDGEMM_L2x4_END: - -LDGEMM_L2x2_BEGIN: - - - andi. T1, M, 2 - ble LDGEMM_L2x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,2,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L2x2_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x2_SUB4 - -LDGEMM_L2x2_LOOP_START: - - LOAD2x2_1 - KERNEL2x2_I1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - addic. L, L, -2 - ble LDGEMM_L2x2_LOOP_END - - MY_ALIGN - -LDGEMM_L2x2_LOOP: - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - addic. L, L, -1 - bgt LDGEMM_L2x2_LOOP - -LDGEMM_L2x2_LOOP_END: - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_E2 - - b LDGEMM_L2x2_SUB1 - -LDGEMM_L2x2_SUB4: - - KERNEL2x2_SUBI1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - - b LDGEMM_L2x2_SUB1 - -LDGEMM_L2x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x2_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x2_SAVE - b LDGEMM_L2x2_SUB2 - -LDGEMM_L2x2_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x2_SAVE - -LDGEMM_L2x2_SUB2: - - KERNEL2x2_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x2_SUB2 - -LDGEMM_L2x2_SAVE: - - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,2 -#endif -LDGEMM_L2x2_END: - -LDGEMM_L2x1_BEGIN: - - - andi. T1, M, 1 - ble LDGEMM_L2x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,1,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. 
L, K, 3 -#endif - ble LDGEMM_L2x1_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x1_SUB4 - -LDGEMM_L2x1_LOOP_START: - - LOAD2x1_1 - KERNEL2x1_I1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - addic. L, L, -2 - ble LDGEMM_L2x1_LOOP_END - - MY_ALIGN - -LDGEMM_L2x1_LOOP: - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - addic. L, L, -1 - bgt LDGEMM_L2x1_LOOP - -LDGEMM_L2x1_LOOP_END: - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_E2 - - b LDGEMM_L2x1_SUB1 - -LDGEMM_L2x1_SUB4: - - KERNEL2x1_SUBI1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - - b LDGEMM_L2x1_SUB1 - -LDGEMM_L2x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x1_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x1_SAVE - b LDGEMM_L2x1_SUB2 - -LDGEMM_L2x1_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x1_SAVE - -LDGEMM_L2x1_SUB2: - - KERNEL2x1_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x1_SUB2 - -LDGEMM_L2x1_SAVE: - - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,2 -#endif -LDGEMM_L2x1_END: - - slwi T1, K, 4 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif -LDGEMM_L2_END: -LDGEMM_L1_BEGIN: - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - andi. T1, N, 1 - ble LDGEMM_L1_END - mr CO, C - mr AO, A - srawi. I, M, 4 - ble LDGEMM_L1x16_END - -LDGEMM_L1x16_BEGIN: - - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,16,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x16_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x16_SUB4 - -LDGEMM_L1x16_LOOP_START: - - #dcbt AO, PRE - LOAD1x16_1 - #dcbt AO, PRE - KERNEL1x16_I1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - addic. L, L, -2 - ble LDGEMM_L1x16_LOOP_END - - MY_ALIGN - -LDGEMM_L1x16_LOOP: - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - addic. L, L, -1 - bgt LDGEMM_L1x16_LOOP - -LDGEMM_L1x16_LOOP_END: - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - KERNEL1x16_E2 - - b LDGEMM_L1x16_SUB1 - -LDGEMM_L1x16_SUB4: - - #dcbt AO, PRE - KERNEL1x16_SUBI1 - #dcbt AO, PRE - KERNEL1x16_SUB1 - #dcbt AO, PRE - KERNEL1x16_SUB1 - #dcbt AO, PRE - KERNEL1x16_SUB1 - - KERNEL1x16_SUB1 - KERNEL1x16_SUB1 - KERNEL1x16_SUB1 - KERNEL1x16_SUB1 - - b LDGEMM_L1x16_SUB1 - -LDGEMM_L1x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x16_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x16_SAVE - b LDGEMM_L1x16_SUB2 - -LDGEMM_L1x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x16_SAVE - -LDGEMM_L1x16_SUB2: - - KERNEL1x16_SUB1 - - addic. 
L, L, -1 - bgt LDGEMM_L1x16_SUB2 - -LDGEMM_L1x16_SAVE: - - SAVE1x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,1 -#endif - addic. I, I, -1 - bgt LDGEMM_L1x16_BEGIN - -LDGEMM_L1x16_END: - -LDGEMM_L1x8_BEGIN: - - andi. T2, M, 15 - ble LDGEMM_L1x1_END - - andi. T1, M, 8 - ble LDGEMM_L1x8_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,8,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x8_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x8_SUB4 - -LDGEMM_L1x8_LOOP_START: - - #dcbt AO, PRE - LOAD1x8_1 - KERNEL1x8_I1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - addic. L, L, -2 - ble LDGEMM_L1x8_LOOP_END - - MY_ALIGN - -LDGEMM_L1x8_LOOP: - - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - addic. L, L, -1 - bgt LDGEMM_L1x8_LOOP - -LDGEMM_L1x8_LOOP_END: - - KERNEL1x8_1 - KERNEL1x8_2 - KERNEL1x8_1 - KERNEL1x8_2 - - KERNEL1x8_1 - KERNEL1x8_2 - KERNEL1x8_1 - KERNEL1x8_E2 - - b LDGEMM_L1x8_SUB1 - -LDGEMM_L1x8_SUB4: - - KERNEL1x8_SUBI1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - - b LDGEMM_L1x8_SUB1 - -LDGEMM_L1x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x8_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x8_SAVE - b LDGEMM_L1x8_SUB2 - -LDGEMM_L1x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x8_SAVE - -LDGEMM_L1x8_SUB2: - - KERNEL1x8_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x8_SUB2 - -LDGEMM_L1x8_SAVE: - - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,1 -#endif -LDGEMM_L1x8_END: - -LDGEMM_L1x4_BEGIN: - - - andi. T1, M, 4 - ble LDGEMM_L1x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,4,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x4_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x4_SUB4 - -LDGEMM_L1x4_LOOP_START: - - LOAD1x4_1 - KERNEL1x4_I1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - addic. L, L, -2 - ble LDGEMM_L1x4_LOOP_END - - MY_ALIGN - -LDGEMM_L1x4_LOOP: - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - addic. L, L, -1 - bgt LDGEMM_L1x4_LOOP - -LDGEMM_L1x4_LOOP_END: - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_E2 - - b LDGEMM_L1x4_SUB1 - -LDGEMM_L1x4_SUB4: - - KERNEL1x4_SUBI1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - - b LDGEMM_L1x4_SUB1 - -LDGEMM_L1x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x4_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x4_SAVE - b LDGEMM_L1x4_SUB2 - -LDGEMM_L1x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x4_SAVE - -LDGEMM_L1x4_SUB2: - - KERNEL1x4_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x4_SUB2 - -LDGEMM_L1x4_SAVE: - - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,1 -#endif -LDGEMM_L1x4_END: - -LDGEMM_L1x2_BEGIN: - - - andi. 
T1, M, 2 - ble LDGEMM_L1x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,2,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x2_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x2_SUB4 - -LDGEMM_L1x2_LOOP_START: - - LOAD1x2_1 - KERNEL1x2_I1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - addic. L, L, -2 - ble LDGEMM_L1x2_LOOP_END - - MY_ALIGN - -LDGEMM_L1x2_LOOP: - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - addic. L, L, -1 - bgt LDGEMM_L1x2_LOOP - -LDGEMM_L1x2_LOOP_END: - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_E2 - - b LDGEMM_L1x2_SUB1 - -LDGEMM_L1x2_SUB4: - - KERNEL1x2_SUBI1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - - b LDGEMM_L1x2_SUB1 - -LDGEMM_L1x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x2_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x2_SAVE - b LDGEMM_L1x2_SUB2 - -LDGEMM_L1x2_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x2_SAVE - -LDGEMM_L1x2_SUB2: - - KERNEL1x2_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x2_SUB2 - -LDGEMM_L1x2_SAVE: - - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,1 -#endif -LDGEMM_L1x2_END: - -LDGEMM_L1x1_BEGIN: - - - andi. T1, M, 1 - ble LDGEMM_L1x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,1,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x1_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x1_SUB4 - -LDGEMM_L1x1_LOOP_START: - - LOAD1x1_1 - KERNEL1x1_I1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - addic. L, L, -2 - ble LDGEMM_L1x1_LOOP_END - - MY_ALIGN - -LDGEMM_L1x1_LOOP: - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - addic. L, L, -1 - bgt LDGEMM_L1x1_LOOP - -LDGEMM_L1x1_LOOP_END: - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_E2 - - b LDGEMM_L1x1_SUB1 - -LDGEMM_L1x1_SUB4: - - KERNEL1x1_SUBI1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - - b LDGEMM_L1x1_SUB1 - -LDGEMM_L1x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x1_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x1_SAVE - b LDGEMM_L1x1_SUB2 - -LDGEMM_L1x1_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x1_SAVE - -LDGEMM_L1x1_SUB2: - - KERNEL1x1_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x1_SUB2 - -LDGEMM_L1x1_SAVE: - - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,1 -#endif -LDGEMM_L1x1_END: -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif -LDGEMM_L1_END: +/*************************************************************************** +Copyright (c) 2013-2019 The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#define MY_ALIGN .align 3 + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + + srawi. J, N, 2 + ble LDGEMM_L4_END + +LDGEMM_L4_BEGIN: + + + li T1, 128 + li T2, 256 + + mr AO, A + mr CO, C + slwi T3, LDC , 2 + add C, C, T3 + + + dcbt A, T1 + dcbt A, T2 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LDGEMM_L4x16_END + + MY_ALIGN +LDGEMM_L4x16_BEGIN: + + li L, -128 + + + SAVE4x16_REGS + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 +#else + mr BO, B +#endif + + and T1, CO, L + and T2, C2, L + and T3, C3, L + and T4, C4, L + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + + + addi T1, T1, 128 + addi T2, T2, 128 + addi T3, T3, 128 + addi T4, T4, 128 + + dcbt T1, r0 + dcbt T2, r0 + dcbt T3, r0 + dcbt T4, r0 + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T3,K,TEMP_REG,16,4 + srawi. L, T3, 5 +#else + srawi. L, K, 5 +#endif + + ble LDGEMM_L4x16_SUB0 + + + MY_ALIGN +LDGEMM_L4x16_LOOP_START: + + li T2, 512 + + + LOAD4x16_1 + ##OffsetA=128 OffsetB=32 + addi AO,AO,2176 + # addi BO,BO,32 + addic. 
L, L, -1 + + ble LDGEMM_L4x16_LOOP_END + + + mtctr L + + MY_ALIGN + +LDGEMM_L4x16_LOOP: + + #dcbt AO, PRE + KERNEL4x16_I1_L2_2 -2048,32, 0,0 + KERNEL4x16_I1_L2_2 -2048,32, 1,0 + KERNEL4x16_I1_L2_2 -2048,32, 2,0 + KERNEL4x16_I1_L2_2 -2048,32, 3,0 + KERNEL4x16_I1_L2_2 -2048,32, 4,0 + KERNEL4x16_I1_L2_2 -2048,32, 5,0 + KERNEL4x16_I1_L2_2 -2048,32, 6,0 + KERNEL4x16_I1_L2_2 -2048,32, 7,0 + KERNEL4x16_I1_L2_2 -2048,32, 8,0 + KERNEL4x16_I1_L2_2 -2048,32, 9,0 + KERNEL4x16_I1_L2_2 -2048,32, 10,0 + KERNEL4x16_I1_L2_2 -2048,32, 11,0 + KERNEL4x16_I1_L2_2 -2048,32, 12,0 + KERNEL4x16_I1_L2_2 -2048,32, 13,0 + KERNEL4x16_I1_L2_2 -2048,32, 14,0 + KERNEL4x16_I1_L2_2 -2048,32, 15,1 + + + bdnz LDGEMM_L4x16_LOOP + + MY_ALIGN + MY_ALIGN +LDGEMM_L4x16_LOOP_END: + + KERNEL4x16_I1_L2_2 -2048,32, 0,0 + KERNEL4x16_I1_L2_2 -2048,32, 1,0 + KERNEL4x16_I1_L2_2 -2048,32, 2,0 + KERNEL4x16_I1_L2_2 -2048,32, 3,0 + KERNEL4x16_I1_L2_2 -2048,32, 4,0 + KERNEL4x16_I1_L2_2 -2048,32, 5,0 + KERNEL4x16_I1_L2_2 -2048,32, 6,0 + KERNEL4x16_I1_L2_2 -2048,32, 7,0 + KERNEL4x16_I1_L2_2 -2048,32, 8,0 + KERNEL4x16_I1_L2_2 -2048,32, 9,0 + KERNEL4x16_I1_L2_2 -2048,32, 10,0 + KERNEL4x16_I1_L2_2 -2048,32, 11,0 + KERNEL4x16_I1_L2_2 -2048,32, 12,0 + KERNEL4x16_I1_L2_2 -2048,32, 13,0 + KERNEL4x16_I1_L2_2 -2048,32, 14,0 + KERNEL4x16_I1_L2_3 -2048,32, 15,1 + b LDGEMM_L4x16_SUB1 + + + MY_ALIGN +LDGEMM_L4x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 31 +#else + andi. L, K, 31 +#endif + KERNEL4x16 1 + + addic. L, L, -1 + ble LDGEMM_L4x16_SAVE + b LDGEMM_L4x16_SUB2 + MY_ALIGN +LDGEMM_L4x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 31 +#else + andi. L, K, 31 +#endif + ble LDGEMM_L4x16_SAVE + MY_ALIGN +LDGEMM_L4x16_SUB2: + + andi. T1,L, 16 + ble LDGEMM_L4x16_SUB2_8 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_2 128,32, 1,0 + KERNEL4x16_I1_L2_2 128,32, 2,0 + KERNEL4x16_I1_L2_2 128,32, 3,0 + KERNEL4x16_I1_L2_2 128,32, 4,0 + KERNEL4x16_I1_L2_2 128,32, 5,0 + KERNEL4x16_I1_L2_2 128,32, 6,0 + KERNEL4x16_I1_L2_3 128,32, 7,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_8: + andi. T1,L, 8 + ble LDGEMM_L4x16_SUB2_4 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_2 128,32, 1,0 + KERNEL4x16_I1_L2_2 128,32, 2,0 + KERNEL4x16_I1_L2_3 128,32, 3,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_4: + andi. T1,L, 4 + ble LDGEMM_L4x16_SUB2_2 + LOAD4x16_0 + KERNEL4x16_I1_L2_2 128,32, 0,0 + KERNEL4x16_I1_L2_3 128,32, 1,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_2: + andi. T1,L, 2 + ble LDGEMM_L4x16_SUB2_1 + LOAD4x16_0 + KERNEL4x16_I1_L2_3 128,32, 0,1 + MY_ALIGN +LDGEMM_L4x16_SUB2_1: + andi. T1,L, 1 + ble LDGEMM_L4x16_SAVE + KERNEL4x16 0 +# addic. L, L, -1 +# bgt LDGEMM_L4x16_SUB2 + + MY_ALIGN +LDGEMM_L4x16_SAVE: + SAVE4x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,4 +#endif + addic. I, I, -1 + bgt+ LDGEMM_L4x16_BEGIN + +LDGEMM_L4x16_END: + +LDGEMM_L4x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L4x1_END + + andi. T1, M, 8 + ble LDGEMM_L4x8_END + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,4 + srawi. L, T3, 4 +#else + mr BO, B + srawi. L, K, 4 +#endif + + + ble LDGEMM_L4x8_SUB0 + +LDGEMM_L4x8_LOOP_START: + + + LOAD4x8_1 + ##OffsetA=64 OffsetB=32 + + + addic. 
L, L, -1 + + ble LDGEMM_L4x8_LOOP_END + + mtctr L + MY_ALIGN + +LDGEMM_L4x8_LOOP: + + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_2 64,32, 3,0 + KERNEL4x8_I1_L2_2 64,32, 4,0 + KERNEL4x8_I1_L2_2 64,32, 5,0 + KERNEL4x8_I1_L2_2 64,32, 6,0 + KERNEL4x8_I1_L2_2 64,32, 7,1 + + bdnz LDGEMM_L4x8_LOOP + MY_ALIGN +LDGEMM_L4x8_LOOP_END: + + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_2 64,32, 3,0 + KERNEL4x8_I1_L2_2 64,32, 4,0 + KERNEL4x8_I1_L2_2 64,32, 5,0 + KERNEL4x8_I1_L2_2 64,32, 6,0 + KERNEL4x8_I1_L2_3 64,32, 7,1 + + b LDGEMM_L4x8_SUB1 + MY_ALIGN +LDGEMM_L4x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 15 +#else + andi. L, K, 15 +#endif + KERNEL4x8 1 + + addic. L, L, -1 + ble LDGEMM_L4x8_SAVE + b LDGEMM_L4x8_SUB2 + MY_ALIGN +LDGEMM_L4x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 15 +#else + andi. L, K, 15 +#endif + ble LDGEMM_L4x8_SAVE + MY_ALIGN +LDGEMM_L4x8_SUB2: + + andi. T1,L, 8 + ble LDGEMM_L4x8_SUB2_4 + LOAD4x8_0 + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_2 64,32, 1,0 + KERNEL4x8_I1_L2_2 64,32, 2,0 + KERNEL4x8_I1_L2_3 64,32, 3,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_4: + andi. T1,L, 4 + ble LDGEMM_L4x8_SUB2_2 + LOAD4x8_0 + KERNEL4x8_I1_L2_2 64,32, 0,0 + KERNEL4x8_I1_L2_3 64,32, 1,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_2: + andi. T1,L, 2 + ble LDGEMM_L4x8_SUB2_1 + LOAD4x8_0 + KERNEL4x8_I1_L2_3 64,32, 0,1 + MY_ALIGN +LDGEMM_L4x8_SUB2_1: + andi. T1,L, 1 + ble LDGEMM_L4x8_SAVE + KERNEL4x8 0 + + MY_ALIGN +LDGEMM_L4x8_SAVE: + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,4 +#endif +LDGEMM_L4x8_END: + +LDGEMM_L4x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L4x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x4_SUB4 + +LDGEMM_L4x4_LOOP_START: + + #dcbt AO, PRE + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + addic. L, L, -2 + ble LDGEMM_L4x4_LOOP_END + + MY_ALIGN + +LDGEMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + #dcbt AO, PRE + KERNEL4x4_2 + + addic. L, L, -1 + bgt LDGEMM_L4x4_LOOP + +LDGEMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b LDGEMM_L4x4_SUB1 + +LDGEMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b LDGEMM_L4x4_SUB1 + +LDGEMM_L4x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x4_SAVE + b LDGEMM_L4x4_SUB2 + +LDGEMM_L4x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x4_SAVE + +LDGEMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x4_SUB2 + +LDGEMM_L4x4_SAVE: + + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,4 +#endif +LDGEMM_L4x4_END: + +LDGEMM_L4x2_BEGIN: + + + andi. T1, M, 2 + ble LDGEMM_L4x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x2_SUB4 + +LDGEMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble LDGEMM_L4x2_LOOP_END + + MY_ALIGN + +LDGEMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt LDGEMM_L4x2_LOOP + +LDGEMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b LDGEMM_L4x2_SUB1 + +LDGEMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b LDGEMM_L4x2_SUB1 + +LDGEMM_L4x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x2_SAVE + b LDGEMM_L4x2_SUB2 + +LDGEMM_L4x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x2_SAVE + +LDGEMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x2_SUB2 + +LDGEMM_L4x2_SAVE: + + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,4 +#endif +LDGEMM_L4x2_END: + +LDGEMM_L4x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,4 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L4x1_SUB4 + +LDGEMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble LDGEMM_L4x1_LOOP_END + + MY_ALIGN + +LDGEMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt LDGEMM_L4x1_LOOP + +LDGEMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b LDGEMM_L4x1_SUB1 + +LDGEMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b LDGEMM_L4x1_SUB1 + +LDGEMM_L4x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L4x1_SAVE + b LDGEMM_L4x1_SUB2 + +LDGEMM_L4x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L4x1_SAVE + +LDGEMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L4x1_SUB2 + +LDGEMM_L4x1_SAVE: + + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,4 +#endif +LDGEMM_L4x1_END: + + slwi T1, K, 5 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + addic. J, J, -1 + bgt LDGEMM_L4_BEGIN + + andi. T2, N, 3 + ble .L999 + +LDGEMM_L4_END: + + b LDGEMM_L2_BEGIN + +.L999_H1: + + b .L999 + +LDGEMM_L2_BEGIN: + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + andi. T1, N, 2 + ble LDGEMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 4 + ble LDGEMM_L2x16_END + +LDGEMM_L2x16_BEGIN: + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,16,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x16_SUB4 + +LDGEMM_L2x16_LOOP_START: + + #dcbt AO, PRE + LOAD2x16_1 + #dcbt AO, PRE + KERNEL2x16_I1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -2 + ble LDGEMM_L2x16_LOOP_END + + MY_ALIGN + +LDGEMM_L2x16_LOOP: + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt LDGEMM_L2x16_LOOP + +LDGEMM_L2x16_LOOP_END: + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + + #dcbt AO, PRE + KERNEL2x16_1 + #dcbt AO, PRE + KERNEL2x16_2 + #dcbt AO, PRE + KERNEL2x16_1 + KERNEL2x16_E2 + + b LDGEMM_L2x16_SUB1 + +LDGEMM_L2x16_SUB4: + + #dcbt AO, PRE + KERNEL2x16_SUBI1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + #dcbt AO, PRE + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b LDGEMM_L2x16_SUB1 + +LDGEMM_L2x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x16_SAVE + b LDGEMM_L2x16_SUB2 + +LDGEMM_L2x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x16_SAVE + +LDGEMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x16_SUB2 + +LDGEMM_L2x16_SAVE: + + SAVE2x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,2 +#endif + addic. I, I, -1 + bgt LDGEMM_L2x16_BEGIN + +LDGEMM_L2x16_END: + +LDGEMM_L2x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L2x1_END + + andi. T1, M, 8 + ble LDGEMM_L2x8_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x8_SUB4 + +LDGEMM_L2x8_LOOP_START: + + #dcbt AO, PRE + LOAD2x8_1 + KERNEL2x8_I1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -2 + ble LDGEMM_L2x8_LOOP_END + + MY_ALIGN + +LDGEMM_L2x8_LOOP: + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + #dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -1 + bgt LDGEMM_L2x8_LOOP + +LDGEMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b LDGEMM_L2x8_SUB1 + +LDGEMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b LDGEMM_L2x8_SUB1 + +LDGEMM_L2x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x8_SAVE + b LDGEMM_L2x8_SUB2 + +LDGEMM_L2x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x8_SAVE + +LDGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. 
L, L, -1 + bgt LDGEMM_L2x8_SUB2 + +LDGEMM_L2x8_SAVE: + + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,2 +#endif +LDGEMM_L2x8_END: + +LDGEMM_L2x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x4_SUB4 + +LDGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble LDGEMM_L2x4_LOOP_END + + MY_ALIGN + +LDGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt LDGEMM_L2x4_LOOP + +LDGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b LDGEMM_L2x4_SUB1 + +LDGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b LDGEMM_L2x4_SUB1 + +LDGEMM_L2x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x4_SAVE + b LDGEMM_L2x4_SUB2 + +LDGEMM_L2x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x4_SAVE + +LDGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x4_SUB2 + +LDGEMM_L2x4_SAVE: + + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,2 +#endif +LDGEMM_L2x4_END: + +LDGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble LDGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x2_SUB4 + +LDGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble LDGEMM_L2x2_LOOP_END + + MY_ALIGN + +LDGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt LDGEMM_L2x2_LOOP + +LDGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b LDGEMM_L2x2_SUB1 + +LDGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b LDGEMM_L2x2_SUB1 + +LDGEMM_L2x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x2_SAVE + b LDGEMM_L2x2_SUB2 + +LDGEMM_L2x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x2_SAVE + +LDGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x2_SUB2 + +LDGEMM_L2x2_SAVE: + + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,2 +#endif +LDGEMM_L2x2_END: + +LDGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,2 + srawi. L, T3, 3 +#else + mr BO, B + srawi. 
L, K, 3 +#endif + ble LDGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L2x1_SUB4 + +LDGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble LDGEMM_L2x1_LOOP_END + + MY_ALIGN + +LDGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt LDGEMM_L2x1_LOOP + +LDGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b LDGEMM_L2x1_SUB1 + +LDGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b LDGEMM_L2x1_SUB1 + +LDGEMM_L2x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L2x1_SAVE + b LDGEMM_L2x1_SUB2 + +LDGEMM_L2x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L2x1_SAVE + +LDGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L2x1_SUB2 + +LDGEMM_L2x1_SAVE: + + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,2 +#endif +LDGEMM_L2x1_END: + + slwi T1, K, 4 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif +LDGEMM_L2_END: +LDGEMM_L1_BEGIN: + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + andi. T1, N, 1 + ble LDGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 4 + ble LDGEMM_L1x16_END + +LDGEMM_L1x16_BEGIN: + + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,16,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x16_SUB4 + +LDGEMM_L1x16_LOOP_START: + + #dcbt AO, PRE + LOAD1x16_1 + #dcbt AO, PRE + KERNEL1x16_I1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble LDGEMM_L1x16_LOOP_END + + MY_ALIGN + +LDGEMM_L1x16_LOOP: + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -1 + bgt LDGEMM_L1x16_LOOP + +LDGEMM_L1x16_LOOP_END: + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + + #dcbt AO, PRE + KERNEL1x16_1 + #dcbt AO, PRE + KERNEL1x16_2 + #dcbt AO, PRE + KERNEL1x16_1 + KERNEL1x16_E2 + + b LDGEMM_L1x16_SUB1 + +LDGEMM_L1x16_SUB4: + + #dcbt AO, PRE + KERNEL1x16_SUBI1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + #dcbt AO, PRE + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b LDGEMM_L1x16_SUB1 + +LDGEMM_L1x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x16_SAVE + b LDGEMM_L1x16_SUB2 + +LDGEMM_L1x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x16_SAVE + +LDGEMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. 
L, L, -1 + bgt LDGEMM_L1x16_SUB2 + +LDGEMM_L1x16_SAVE: + + SAVE1x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,1 +#endif + addic. I, I, -1 + bgt LDGEMM_L1x16_BEGIN + +LDGEMM_L1x16_END: + +LDGEMM_L1x8_BEGIN: + + andi. T2, M, 15 + ble LDGEMM_L1x1_END + + andi. T1, M, 8 + ble LDGEMM_L1x8_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,8,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x8_SUB4 + +LDGEMM_L1x8_LOOP_START: + + #dcbt AO, PRE + LOAD1x8_1 + KERNEL1x8_I1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -2 + ble LDGEMM_L1x8_LOOP_END + + MY_ALIGN + +LDGEMM_L1x8_LOOP: + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + #dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -1 + bgt LDGEMM_L1x8_LOOP + +LDGEMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b LDGEMM_L1x8_SUB1 + +LDGEMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b LDGEMM_L1x8_SUB1 + +LDGEMM_L1x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x8_SAVE + b LDGEMM_L1x8_SUB2 + +LDGEMM_L1x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x8_SAVE + +LDGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x8_SUB2 + +LDGEMM_L1x8_SAVE: + + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,1 +#endif +LDGEMM_L1x8_END: + +LDGEMM_L1x4_BEGIN: + + + andi. T1, M, 4 + ble LDGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,4,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x4_SUB4 + +LDGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble LDGEMM_L1x4_LOOP_END + + MY_ALIGN + +LDGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt LDGEMM_L1x4_LOOP + +LDGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b LDGEMM_L1x4_SUB1 + +LDGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b LDGEMM_L1x4_SUB1 + +LDGEMM_L1x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x4_SAVE + b LDGEMM_L1x4_SUB2 + +LDGEMM_L1x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x4_SAVE + +LDGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x4_SUB2 + +LDGEMM_L1x4_SAVE: + + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,1 +#endif +LDGEMM_L1x4_END: + +LDGEMM_L1x2_BEGIN: + + + andi. 
T1, M, 2 + ble LDGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,2,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x2_SUB4 + +LDGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble LDGEMM_L1x2_LOOP_END + + MY_ALIGN + +LDGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt LDGEMM_L1x2_LOOP + +LDGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b LDGEMM_L1x2_SUB1 + +LDGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b LDGEMM_L1x2_SUB1 + +LDGEMM_L1x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x2_SAVE + b LDGEMM_L1x2_SUB2 + +LDGEMM_L1x2_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x2_SAVE + +LDGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x2_SUB2 + +LDGEMM_L1x2_SAVE: + + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,1 +#endif +LDGEMM_L1x2_END: + +LDGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble LDGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 + REFRESH_TEMP_BK T3,K,TEMP_REG,1,1 + srawi. L, T3, 3 +#else + mr BO, B + srawi. L, K, 3 +#endif + ble LDGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble LDGEMM_L1x1_SUB4 + +LDGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble LDGEMM_L1x1_LOOP_END + + MY_ALIGN + +LDGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt LDGEMM_L1x1_LOOP + +LDGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b LDGEMM_L1x1_SUB1 + +LDGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b LDGEMM_L1x1_SUB1 + +LDGEMM_L1x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble LDGEMM_L1x1_SAVE + b LDGEMM_L1x1_SUB2 + +LDGEMM_L1x1_SUB1: +#if defined(TRMMKERNEL) + andi. L, T3, 7 +#else + andi. L, K, 7 +#endif + ble LDGEMM_L1x1_SAVE + +LDGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. L, L, -1 + bgt LDGEMM_L1x1_SUB2 + +LDGEMM_L1x1_SAVE: + + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,1 +#endif +LDGEMM_L1x1_END: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif +LDGEMM_L1_END: diff --git a/kernel/power/dgemm_macros_power9.S b/kernel/power/dgemm_macros_power9.S index c4b8270b8..4eddab24f 100644 --- a/kernel/power/dgemm_macros_power9.S +++ b/kernel/power/dgemm_macros_power9.S @@ -1,3623 +1,3623 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@googlemail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ - -/********************************************************************* -* Macros for N=4, M=16 * -*********************************************************************/ -.macro LOAD4x16_1 - LOAD4x16 1 -.endm - -.macro LOAD4x16_0 - LOAD4x16 0 -.endm -.macro LOAD4x16 Zero - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - - lxv vs4, 64(AO) - lxv vs5, 80(AO) - lxv vs6, 96(AO) - lxv vs7, 112(AO) -.if \Zero==1 - xxlxor vs32,vs32,vs32 - xxlxor vs33,vs33,vs33 - xxlxor vs34,vs34,vs34 - xxlxor vs35,vs35,vs35 - xxlxor vs36,vs36,vs36 - xxlxor vs37,vs37,vs37 - xxlxor vs38,vs38,vs38 - xxlxor vs39,vs39,vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endif -.endm - - -#define unit_size 8 -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) - -.macro KERNEL4x16_L1_L2 Index,IsLast - 
KERNEL4x16_L1_L2_I AO,BO, 0,0,0, \Index,\IsLast,0 -.endm - - - -.macro KERNEL4x16_I1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO,1,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L2_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x16_I2_L2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I \AREG,\BREG,1,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I2_L2_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I2_L2_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x16_L1_L2_I AREG,BREG, First, OffsetA,OffsetB, Index,IsLast ,Complete - -.if \First ==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 -.endif - lxv vs8, DISP32(\Index,0+\OffsetA)(\AREG) - lxv vs9, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs10, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs11, DISP32(\Index,48+\OffsetA)(\AREG) -.if \First ==1 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 -.else - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 -.endif - lxv vs28, DISP8(\Index,0 +\OffsetB)(\BREG) - lxv vs30, DISP8(\Index,16 +\OffsetB)(\BREG) - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs31, vs30, vs30,2 -.if \First ==1 - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - - - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - - -.else - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - -.endif - lxv vs12, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs13, DISP32(\Index,80+\OffsetA)(\AREG) -.if \First ==1 - xvmuldp vs52, vs4, vs26 - xvmuldp vs53, vs5, vs26 - xvmuldp vs54, vs6, vs26 - xvmuldp vs55, vs7, vs26 - -.else - xvmaddadp vs52, vs4, vs26 - xvmaddadp vs53, vs5, vs26 - xvmaddadp vs54, vs6, vs26 - xvmaddadp vs55, vs7, vs26 -.endif - lxv vs14, DISP32(\Index,96+\OffsetA)(\AREG) - lxv vs15, DISP32(\Index,112+\OffsetA)(\AREG) -.if \First ==1 - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - - - - xvmuldp vs60, vs4, vs27 - xvmuldp vs61, vs5, vs27 - xvmuldp vs62, vs6, vs27 - xvmuldp vs63, vs7, vs27 - -.else - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - - - - xvmaddadp vs60, vs4, vs27 - xvmaddadp vs61, vs5, vs27 - xvmaddadp vs62, vs6, vs27 - xvmaddadp vs63, vs7, vs27 -.endif - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 -.if \Complete==0 - 
lxv vs0, DISP32(\Index,128+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,144+\OffsetA)(\AREG) -.endif - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 -.if \Complete==0 - lxv vs24, DISP8(\Index,32 +\OffsetB)(\BREG) - lxv vs26, DISP8(\Index,48 +\OffsetB)(\BREG) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endif - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 -.if \Complete==0 - lxv vs2, DISP32(\Index,160+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,176+\OffsetA)(\AREG) -.endif - xvmaddadp vs44, vs12, vs29 - xvmaddadp vs45, vs13, vs29 - xvmaddadp vs46, vs14, vs29 - xvmaddadp vs47, vs15, vs29 - - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - xvmaddadp vs50, vs10, vs30 - xvmaddadp vs51, vs11, vs30 -.if \Complete==0 - lxv vs4, DISP32(\Index,192+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,208+\OffsetA)(\AREG) -.endif - xvmaddadp vs52, vs12, vs30 - xvmaddadp vs53, vs13, vs30 - xvmaddadp vs54, vs14, vs30 - xvmaddadp vs55, vs15, vs30 -.if \Complete==0 - lxv vs6, DISP32(\Index,224+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,240+\OffsetA)(\AREG) -.endif - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - xvmaddadp vs58, vs10, vs31 - xvmaddadp vs59, vs11, vs31 - - - xvmaddadp vs60, vs12, vs31 - - xvmaddadp vs61, vs13, vs31 - xvmaddadp vs62, vs14, vs31 - - xvmaddadp vs63, vs15, vs31 - .if \IsLast==1 - .if \Complete==1 - addi \AREG, \AREG, DISP32(\Index,128+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,32+\OffsetB) - .else - addi \AREG, \AREG, DISP32(\Index,256) - addi \BREG, \BREG, DISP8(\Index,64) - .endif - .endif - - -.endm - - - -.macro KERNEL4x16 First - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - lxv vs4, 64(AO) - lxv vs5, 80(AO) - lxv vs6, 96(AO) - lxv vs7, 112(AO) - - - - addi BO, BO, 32 - addi AO, AO, 128 - -.if \First==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - xvmuldp vs52, vs4, vs26 - xvmuldp vs53, vs5, vs26 - xvmuldp vs54, vs6, vs26 - xvmuldp vs55, vs7, vs26 - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - xvmuldp vs60, vs4, vs27 - xvmuldp vs61, vs5, vs27 - xvmuldp vs62, vs6, vs27 - xvmuldp vs63, vs7, vs27 -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - - xvmaddadp vs52, vs4, vs26 - xvmaddadp vs53, vs5, vs26 - xvmaddadp vs54, vs6, vs26 - xvmaddadp vs55, vs7, 
vs26 - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - xvmaddadp vs60, vs4, vs27 - xvmaddadp vs61, vs5, vs27 - xvmaddadp vs62, vs6, vs27 - xvmaddadp vs63, vs7, vs27 - -.endif -.endm - -.macro SAVE4x16_REGS - add C2, CO, LDC - add C3, C2, LDC - add C4, C3, LDC -.endm - -.macro SAVE4x16 -#ifndef TRMMKERNEL - lxv vs0, 0(CO) - lxv vs2, 16(CO) - lxv vs4, 32(CO) - lxv vs6, 48(CO) -#endif - xxpermdi vs8, vs40,vs32,1 - xxpermdi vs9 ,vs32,vs40,1 -#ifndef TRMMKERNEL - lxv vs24, 64(CO) - lxv vs26, 80(CO) - lxv vs28, 96(CO) - lxv vs30, 112(CO) -#endif - xxpermdi vs10, vs41,vs33,1 - xxpermdi vs11 ,vs33,vs41,1 -#ifndef TRMMKERNEL - lxv vs1, 0(C2) - lxv vs3, 16(C2) - lxv vs5, 32(C2) - lxv vs7, 48(C2) -#endif - xxpermdi vs12, vs42,vs34,1 - xxpermdi vs13 ,vs34,vs42,1 -#ifndef TRMMKERNEL - lxv vs25, 64(C2) - lxv vs27, 80(C2) -#endif - xxpermdi vs14, vs43,vs35,1 - xxpermdi vs15 ,vs35,vs43,1 -#ifndef TRMMKERNEL - lxv vs29, 96(C2) - lxv vs31, 112(C2) -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r - xvmuldp vs3, vs11, alpha_r - -#endif - xxpermdi vs8, vs44,vs36,1 - xxpermdi vs9 ,vs36,vs44,1 - xxpermdi vs10, vs45,vs37,1 - xxpermdi vs11 ,vs37,vs45,1 -#ifndef TRMMKERNEL - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r -#endif - xxpermdi vs12, vs46,vs38,1 - xxpermdi vs13 ,vs38,vs46,1 - xxpermdi vs14, vs47,vs39,1 - xxpermdi vs15 ,vs39,vs47,1 - -#ifndef TRMMKERNEL - xvmaddadp vs24, vs8, alpha_r - xvmaddadp vs25, vs9, alpha_r - xvmaddadp vs26, vs10, alpha_r - xvmaddadp vs27, vs11, alpha_r - - xvmaddadp vs28, vs12, alpha_r - xvmaddadp vs29, vs13, alpha_r - xvmaddadp vs30, vs14, alpha_r - xvmaddadp vs31, vs15, alpha_r -#else - xvmuldp vs24, vs8, alpha_r - xvmuldp vs25, vs9, alpha_r - xvmuldp vs26, vs10, alpha_r - xvmuldp vs27, vs11, alpha_r - - xvmuldp vs28, vs12, alpha_r - xvmuldp vs29, vs13, alpha_r - xvmuldp vs30, vs14, alpha_r - xvmuldp vs31, vs15, alpha_r - -#endif - stxv vs0, 0(CO) - stxv vs2, 16(CO) - stxv vs4, 32(CO) - stxv vs6, 48(CO) - - stxv vs24, 64(CO) - stxv vs26, 80(CO) - stxv vs28, 96(CO) - stxv vs30, 112(CO) - - stxv vs1, 0(C2) - stxv vs3, 16(C2) - stxv vs5, 32(C2) - stxv vs7, 48(C2) - - stxv vs25, 64(C2) - stxv vs27, 80(C2) - stxv vs29, 96(C2) - stxv vs31, 112(C2) -#ifndef TRMMKERNEL - lxv vs0, 0(C3) - lxv vs2, 16(C3) - lxv vs4, 32(C3) - lxv vs6, 48(C3) -#endif - xxpermdi vs8, vs56,vs48,1 - xxpermdi vs9 ,vs48,vs56,1 -#ifndef TRMMKERNEL - lxv vs24, 64(C3) - lxv vs26, 80(C3) -#endif - xxpermdi vs10, vs57,vs49,1 - xxpermdi vs11 ,vs49,vs57,1 -#ifndef TRMMKERNEL - lxv vs28, 96(C3) - lxv vs30, 112(C3) -#endif - xxpermdi vs12, vs58,vs50,1 - xxpermdi vs13 ,vs50,vs58,1 -#ifndef TRMMKERNEL - lxv vs1, 0(C4) - lxv vs3, 16(C4) -#endif - xxpermdi vs14, vs59,vs51,1 - xxpermdi vs15 ,vs51,vs59,1 -#ifndef TRMMKERNEL - lxv vs5, 32(C4) - lxv vs7, 48(C4) - - lxv vs25, 64(C4) - lxv vs27, 80(C4) - lxv vs29, 96(C4) - lxv vs31, 112(C4) -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r 
- xvmuldp vs3, vs11, alpha_r - -#endif - - xxpermdi vs8, vs60,vs52,1 - xxpermdi vs9 ,vs52,vs60,1 - xxpermdi vs10, vs61,vs53,1 - xxpermdi vs11 ,vs53,vs61,1 -#ifndef TRMMKERNEL - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r -#endif - - - xxpermdi vs12, vs62,vs54,1 - xxpermdi vs13 ,vs54,vs62,1 - xxpermdi vs14, vs63,vs55,1 - xxpermdi vs15 ,vs55,vs63,1 -#ifndef TRMMKERNEL - xvmaddadp vs24, vs8, alpha_r - xvmaddadp vs25, vs9, alpha_r - xvmaddadp vs26, vs10, alpha_r - xvmaddadp vs27, vs11, alpha_r - - xvmaddadp vs28, vs12, alpha_r - xvmaddadp vs29, vs13, alpha_r - xvmaddadp vs30, vs14, alpha_r - xvmaddadp vs31, vs15, alpha_r -#else - xvmuldp vs24, vs8, alpha_r - xvmuldp vs25, vs9, alpha_r - xvmuldp vs26, vs10, alpha_r - xvmuldp vs27, vs11, alpha_r - - xvmuldp vs28, vs12, alpha_r - xvmuldp vs29, vs13, alpha_r - xvmuldp vs30, vs14, alpha_r - xvmuldp vs31, vs15, alpha_r -#endif - stxv vs0, 0(C3) - stxv vs2, 16(C3) - stxv vs4, 32(C3) - stxv vs6, 48(C3) - - stxv vs24, 64(C3) - stxv vs26, 80(C3) - stxv vs28, 96(C3) - stxv vs30, 112(C3) - - stxv vs1, 0(C4) - stxv vs3, 16(C4) - stxv vs5, 32(C4) - stxv vs7, 48(C4) - - stxv vs25, 64(C4) - stxv vs27, 80(C4) - stxv vs29, 96(C4) - stxv vs31, 112(C4) - - addi CO, CO, 128 -.endm - -/********************************************************************* -* Macros for N=4, M=8 * -*********************************************************************/ - -.macro LOAD4x8_1 - LOAD4x8 1 -.endm - -.macro LOAD4x8_0 - LOAD4x8 0 -.endm -.macro LOAD4x8 Zero - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - - -.if \Zero==1 - xxlxor vs32,vs32,vs32 - xxlxor vs33,vs33,vs33 - xxlxor vs34,vs34,vs34 - xxlxor vs35,vs35,vs35 - - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - -.endif -.endm - - - -.macro KERNEL4x8_L1_L2 Index,IsLast - KERNEL4x8_L1_L2_I 0,0,0, \Index,\IsLast,0 -.endm - - - -.macro KERNEL4x8_I1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I 1,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L2_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x8_L1_L2_I First, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index,0+\OffsetA)(AO) - lxv vs9, DISP16(\Index,16+\OffsetA)(AO) -.if \First ==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 -.endif - - lxv vs10, DISP16(\Index,32+\OffsetA)(AO) - lxv vs11, DISP16(\Index,48+\OffsetA)(AO) - - - -.if \First ==1 - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - - -.else - - lxv vs28, DISP8(\Index,0 
+\OffsetB)(BO) - lxv vs30, DISP8(\Index,16 +\OffsetB)(BO) - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - -.endif - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs31, vs30, vs30,2 -.if \First ==1 - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - -.else - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - -.endif - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(AO) - lxv vs1, DISP16(\Index,80+\OffsetA)(AO) -.endif - - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - -.if \Complete==0 - lxv vs2, DISP16(\Index,96+\OffsetA)(AO) - lxv vs3, DISP16(\Index,112+\OffsetA)(AO) -.endif - - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - xvmaddadp vs50, vs10, vs30 - xvmaddadp vs51, vs11, vs30 -.if \Complete==0 - lxv vs24, DISP8(\Index,32 +\OffsetB)(BO) - lxv vs26, DISP8(\Index,48 +\OffsetB)(BO) -.endif - - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - xvmaddadp vs58, vs10, vs31 - xvmaddadp vs59, vs11, vs31 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endif - - .if \IsLast==1 - .if \Complete==1 - addi AO, AO, DISP16(\Index,64+\OffsetA) - addi BO, BO, DISP8(\Index,32+\OffsetB) - .else - addi AO, AO, DISP16(\Index,128) - addi BO, BO, DISP8(\Index,64) - .endif - .endif - - -.endm - - - -.macro KERNEL4x8 First - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - - - - addi BO, BO, 32 - addi AO, AO, 64 - -.if \First==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - - - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - - -.endif -.endm - - - -.macro SAVE4x8 - add T2, CO, LDC - add T3, T2, LDC - add T4, T3, LDC -#ifndef TRMMKERNEL - lxv vs0, 0(CO) - lxv vs2, 16(CO) -#endif - xxpermdi vs8, vs40,vs32,1 - xxpermdi vs9 ,vs32,vs40,1 -#ifndef TRMMKERNEL - lxv vs4, 32(CO) - lxv vs6, 48(CO) -#endif - xxpermdi vs10, vs41,vs33,1 - xxpermdi vs11 ,vs33,vs41,1 -#ifndef TRMMKERNEL - lxv vs1, 0(T2) - lxv vs3, 16(T2) -#endif - xxpermdi vs12, vs42,vs34,1 - xxpermdi vs13 ,vs34,vs42,1 -#ifndef TRMMKERNEL - lxv vs5, 32(T2) - lxv vs7, 48(T2) -#endif - xxpermdi vs14, vs43,vs35,1 - xxpermdi vs15 ,vs35,vs43,1 - - - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - 
xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r - - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r - xvmuldp vs3, vs11, alpha_r - - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r - -#endif - - - stxv vs0, 0(CO) - stxv vs2, 16(CO) - stxv vs4, 32(CO) - stxv vs6, 48(CO) - - - stxv vs1, 0(T2) - stxv vs3, 16(T2) - stxv vs5, 32(T2) - stxv vs7, 48(T2) - - - xxpermdi vs8, vs56,vs48,1 - xxpermdi vs9 ,vs48,vs56,1 -#ifndef TRMMKERNEL - lxv vs0, 0(T3) - lxv vs2, 16(T3) -#endif - xxpermdi vs10, vs57,vs49,1 - xxpermdi vs11 ,vs49,vs57,1 -#ifndef TRMMKERNEL - lxv vs4, 32(T3) - lxv vs6, 48(T3) -#endif - xxpermdi vs12, vs58,vs50,1 - xxpermdi vs13 ,vs50,vs58,1 -#ifndef TRMMKERNEL - lxv vs1, 0(T4) - lxv vs3, 16(T4) -#endif - xxpermdi vs14, vs59,vs51,1 - xxpermdi vs15 ,vs51,vs59,1 -#ifndef TRMMKERNEL - lxv vs5, 32(T4) - lxv vs7, 48(T4) - - - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r - - - - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r - xvmuldp vs3, vs11, alpha_r - - - - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r - -#endif - - - stxv vs0, 0(T3) - stxv vs2, 16(T3) - stxv vs4, 32(T3) - stxv vs6, 48(T3) - - - stxv vs1, 0(T4) - stxv vs3, 16(T4) - stxv vs5, 32(T4) - stxv vs7, 48(T4) - - - - addi CO, CO, 64 -.endm - - -/********************************************************************* -* Macros for N=4, M=4 * -*********************************************************************/ - -.macro LOAD4x4_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - -.endm - -.macro KERNEL4x4_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - -.endm - -.macro KERNEL4x4_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - -.endm - -.macro KERNEL4x4_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - -.endm - -.macro KERNEL4x4_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, 
vs29 - xvmaddadp vs41, vs9, vs29 - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - -.endm - -.macro KERNEL4x4_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - -.endm - -.macro KERNEL4x4_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - -.endm - -.macro SAVE4x4 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs48, alpha_r - xvmaddadp vs1, vs49, alpha_r -#else - xvmuldp vs0, vs48, alpha_r - xvmuldp vs1, vs49, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs56, alpha_r - xvmaddadp vs9, vs57, alpha_r -#else - xvmuldp vs8, vs56, alpha_r - xvmuldp vs9, vs57, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - - addi CO, CO, 32 - -.endm - -/********************************************************************* -* Macros for N=4, M=2 * -*********************************************************************/ - -.macro LOAD4x2_1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - -.endm - -.macro KERNEL4x2_I1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - - xvmuldp vs48, vs0, vs26 - - xvmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x2_1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - - xvmaddadp vs48, vs0, vs26 - - xvmaddadp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x2_2 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmaddadp vs32, vs8, vs28 - - xvmaddadp vs40, vs8, vs29 - - xvmaddadp vs48, vs8, vs30 - - xvmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x2_E2 - - - xvmaddadp vs32, vs8, 
vs28 - - xvmaddadp vs40, vs8, vs29 - - xvmaddadp vs48, vs8, vs30 - - xvmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x2_SUBI1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - - xvmuldp vs48, vs0, vs26 - - xvmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x2_SUB1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - - xvmaddadp vs48, vs0, vs26 - - xvmaddadp vs56, vs0, vs27 - -.endm - -.macro SAVE4x2 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r -#else - xvmuldp vs0, vs32, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r -#else - xvmuldp vs8, vs40, alpha_r -#endif - - stxvd2x vs8, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs48, alpha_r -#else - xvmuldp vs0, vs48, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs56, alpha_r -#else - xvmuldp vs8, vs56, alpha_r -#endif - - stxvd2x vs8, 0, T1 - - addi CO, CO, 16 - -.endm - -/********************************************************************* -* Macros for N=4, M=1 * -*********************************************************************/ - -.macro LOAD4x1_1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - -.endm - -.macro KERNEL4x1_I1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - lxsdx vs30, o16, BO - lxsdx vs31, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - - xsmuldp vs48, vs0, vs26 - - xsmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x1_1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - lxsdx vs30, o16, BO - lxsdx vs31, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - - xsmaddadp vs48, vs0, vs26 - - xsmaddadp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x1_2 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - - xsmaddadp vs48, vs8, vs30 - - xsmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x1_E2 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - - xsmaddadp vs48, vs8, vs30 - - xsmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x1_SUBI1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - - xsmuldp vs48, vs0, vs26 - - xsmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x1_SUB1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - - xsmaddadp vs48, vs0, vs26 - - xsmaddadp vs56, vs0, vs27 - -.endm - -.macro SAVE4x1 - - mr T1, CO - -#ifndef TRMMKERNEL - lxsdx 
vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs32, alpha_r -#else - xsmuldp vs0, vs32, alpha_r -#endif - - stxsdx vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs8, vs40, alpha_r -#else - xsmuldp vs8, vs40, alpha_r -#endif - - stxsdx vs8, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs48, alpha_r -#else - xsmuldp vs0, vs48, alpha_r -#endif - - stxsdx vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs8, vs56, alpha_r -#else - xsmuldp vs8, vs56, alpha_r -#endif - - stxsdx vs8, 0, T1 - - addi CO, CO, 8 - -.endm - -/********************************************************************* -* Macros for N=2, M=16 * -*********************************************************************/ - -.macro LOAD2x16_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - -.endm - -.macro KERNEL2x16_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - -.endm - -.macro KERNEL2x16_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - -.endm - -.macro KERNEL2x16_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - xvmaddadp vs44, vs12, vs29 - xvmaddadp vs45, vs13, vs29 - xvmaddadp vs46, vs14, vs29 - xvmaddadp vs47, vs15, vs29 - -.endm - -.macro KERNEL2x16_E2 - - - xvmaddadp vs32, vs8, vs28 
- xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - xvmaddadp vs44, vs12, vs29 - xvmaddadp vs45, vs13, vs29 - xvmaddadp vs46, vs14, vs29 - xvmaddadp vs47, vs15, vs29 - -.endm - -.macro KERNEL2x16_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - -.endm - -.macro KERNEL2x16_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - -.endm - -.macro SAVE2x16 - - mr T1, CO - addi T2, T1, 64 - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 - - lxvd2x vs4, 0, T2 - lxvd2x vs5, o16, T2 - lxvd2x vs6, o32, T2 - lxvd2x vs7, o48, T2 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r - xvmaddadp vs4, vs36, alpha_r - xvmaddadp vs5, vs37, alpha_r - xvmaddadp vs6, vs38, alpha_r - xvmaddadp vs7, vs39, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r - xvmuldp vs4, vs36, alpha_r - xvmuldp vs5, vs37, alpha_r - xvmuldp vs6, vs38, alpha_r - xvmuldp vs7, vs39, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - stxvd2x vs4, 0, T2 - stxvd2x vs5, o16, T2 - stxvd2x vs6, o32, T2 - stxvd2x vs7, o48, T2 - - add T1, T1, LDC - add T2, T2, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 - lxvd2x vs10, o32, T1 - lxvd2x vs11, o48, T1 - - lxvd2x vs12, 0, T2 - lxvd2x vs13, o16, T2 - lxvd2x vs14, o32, T2 - lxvd2x vs15, o48, T2 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r - xvmaddadp vs10, vs42, alpha_r - xvmaddadp vs11, vs43, alpha_r - xvmaddadp vs12, vs44, alpha_r - xvmaddadp vs13, vs45, alpha_r - xvmaddadp vs14, vs46, alpha_r - xvmaddadp vs15, vs47, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r - xvmuldp vs10, vs42, alpha_r - xvmuldp vs11, vs43, alpha_r - 
xvmuldp vs12, vs44, alpha_r - xvmuldp vs13, vs45, alpha_r - xvmuldp vs14, vs46, alpha_r - xvmuldp vs15, vs47, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - stxvd2x vs10, o32, T1 - stxvd2x vs11, o48, T1 - - stxvd2x vs12, 0, T2 - stxvd2x vs13, o16, T2 - stxvd2x vs14, o32, T2 - stxvd2x vs15, o48, T2 - - addi CO, CO, 128 - -.endm - -/********************************************************************* -* Macros for N=4, M=8 * -*********************************************************************/ - -.macro LOAD2x8_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x8_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - -.endm - -.macro KERNEL2x8_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - -.endm - -.macro KERNEL2x8_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - -.endm - -.macro KERNEL2x8_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - -.endm - -.macro KERNEL2x8_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - -.endm - -.macro KERNEL2x8_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - -.endm - -.macro SAVE2x8 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r -#endif - 
- stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 - lxvd2x vs10, o32, T1 - lxvd2x vs11, o48, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r - xvmaddadp vs10, vs42, alpha_r - xvmaddadp vs11, vs43, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r - xvmuldp vs10, vs42, alpha_r - xvmuldp vs11, vs43, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - stxvd2x vs10, o32, T1 - stxvd2x vs11, o48, T1 - - addi CO, CO, 64 - -.endm - -/********************************************************************* -* Macros for N=2, M=4 * -*********************************************************************/ - -.macro LOAD2x4_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x4_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - -.endm - -.macro KERNEL2x4_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - -.endm - -.macro KERNEL2x4_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - -.endm - -.macro KERNEL2x4_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - -.endm - -.macro KERNEL2x4_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - -.endm - -.macro KERNEL2x4_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - -.endm - -.macro SAVE2x4 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - - addi CO, CO, 32 - -.endm - -/********************************************************************* -* Macros for N=2, M=2 * -*********************************************************************/ - -.macro LOAD2x2_1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x2_I1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - 
xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x2_1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x2_2 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmaddadp vs32, vs8, vs28 - - xvmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x2_E2 - - - xvmaddadp vs32, vs8, vs28 - - xvmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x2_SUBI1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x2_SUB1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - -.endm - -.macro SAVE2x2 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r -#else - xvmuldp vs0, vs32, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r -#else - xvmuldp vs8, vs40, alpha_r -#endif - - stxvd2x vs8, 0, T1 - - addi CO, CO, 16 - -.endm - -/********************************************************************* -* Macros for N=2, M=1 * -*********************************************************************/ - -.macro LOAD2x1_1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x1_I1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x1_1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x1_2 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x1_E2 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x1_SUBI1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x1_SUB1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - -.endm - -.macro SAVE2x1 - - mr T1, CO - -#ifndef TRMMKERNEL - lxsdx vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs32, alpha_r -#else - xsmuldp vs0, vs32, alpha_r -#endif - - stxsdx vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs8, vs40, alpha_r -#else - xsmuldp vs8, vs40, alpha_r -#endif - - stxsdx vs8, 0, T1 - - addi CO, CO, 8 - -.endm - -/********************************************************************* -* Macros for N=1, M=16 * -*********************************************************************/ - -.macro LOAD1x16_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, 
o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - -.endm - -.macro KERNEL1x16_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - -.endm - -.macro KERNEL1x16_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - -.endm - -.macro KERNEL1x16_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - -.endm - -.macro KERNEL1x16_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - -.endm - -.macro KERNEL1x16_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - -.endm - -.macro KERNEL1x16_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - -.endm - -.macro SAVE1x16 - - mr T1, CO - addi T2, T1, 64 - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 - - lxvd2x vs4, 0, T2 - lxvd2x vs5, o16, T2 - lxvd2x vs6, o32, T2 - lxvd2x vs7, o48, T2 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r - xvmaddadp vs4, vs36, alpha_r - xvmaddadp vs5, vs37, alpha_r - xvmaddadp vs6, vs38, alpha_r - xvmaddadp vs7, vs39, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r - 
xvmuldp vs4, vs36, alpha_r - xvmuldp vs5, vs37, alpha_r - xvmuldp vs6, vs38, alpha_r - xvmuldp vs7, vs39, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - stxvd2x vs4, 0, T2 - stxvd2x vs5, o16, T2 - stxvd2x vs6, o32, T2 - stxvd2x vs7, o48, T2 - - addi CO, CO, 128 - -.endm - -/********************************************************************* -* Macros for N=4, M=8 * -*********************************************************************/ - -.macro LOAD1x8_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x8_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - -.endm - -.macro KERNEL1x8_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - -.endm - -.macro KERNEL1x8_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - -.endm - -.macro KERNEL1x8_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - -.endm - -.macro KERNEL1x8_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - -.endm - -.macro KERNEL1x8_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - -.endm - -.macro SAVE1x8 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - addi CO, CO, 64 - -.endm - -/********************************************************************* -* Macros for N=1, M=4 * -*********************************************************************/ - -.macro LOAD1x4_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x4_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - -.endm - -.macro KERNEL1x4_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - -.endm - -.macro 
KERNEL1x4_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - -.endm - -.macro KERNEL1x4_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - -.endm - -.macro KERNEL1x4_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - -.endm - -.macro KERNEL1x4_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - -.endm - -.macro SAVE1x4 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - addi CO, CO, 32 - -.endm - -/********************************************************************* -* Macros for N=1, M=2 * -*********************************************************************/ - -.macro LOAD1x2_1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x2_I1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x2_1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x2_2 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x2_E2 - - - xvmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x2_SUBI1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x2_SUB1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - -.endm - -.macro SAVE1x2 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r -#else - xvmuldp vs0, vs32, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - addi CO, CO, 16 - -.endm - -/********************************************************************* -* Macros for N=1, M=1 * -*********************************************************************/ - -.macro LOAD1x1_1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x1_I1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x1_1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmaddadp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x1_2 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x1_E2 - - - xsmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x1_SUBI1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x1_SUB1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmaddadp vs32, vs0, vs24 - -.endm - -.macro SAVE1x1 - - mr T1, CO - -#ifndef TRMMKERNEL - lxsdx vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs32, alpha_r -#else - xsmuldp vs0, vs32, alpha_r -#endif - - 
stxsdx vs0, 0, T1 - - addi CO, CO, 8 - -.endm - - - - -/****************************TRMM POINTER REFRESH MACROSES*************************/ - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 7 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 4 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 3 - .endif -.endm - -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*16; -// ptrbb = bb + off*2; -// #endif -*/ -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - - #else - /* - // ptrba =ptrba+ off*C_A; - // ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+16; // number of values in A -// #else -// temp = off+2; // number of values in B -// #endif -*/ -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - /* temp = bk-off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif - -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 16; // number of values in A -// #else -// temp -= 2; // number of values in B -// #endif -// ptrba += temp*16; -// ptrbb += temp*2; -// #endif - -// #ifdef LEFT -// off += 16; // number of values in A -// #endif -*/ - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - - #endif - - #ifdef LEFT - /*off += 8; // number of values in A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* Abdelrauf(quickwritereader@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/********************************************************************* +* Macros for N=4, M=16 * +*********************************************************************/ +.macro LOAD4x16_1 + LOAD4x16 1 +.endm + +.macro LOAD4x16_0 + LOAD4x16 0 +.endm +.macro LOAD4x16 Zero + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + lxv vs4, 64(AO) + lxv vs5, 80(AO) + lxv vs6, 96(AO) + lxv vs7, 112(AO) +.if \Zero==1 + xxlxor vs32,vs32,vs32 + xxlxor vs33,vs33,vs33 + xxlxor vs34,vs34,vs34 + xxlxor vs35,vs35,vs35 + xxlxor vs36,vs36,vs36 + xxlxor vs37,vs37,vs37 + xxlxor vs38,vs38,vs38 + xxlxor vs39,vs39,vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endif +.endm + + +#define unit_size 8 +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) + +.macro KERNEL4x16_L1_L2 Index,IsLast + KERNEL4x16_L1_L2_I AO,BO, 0,0,0, \Index,\IsLast,0 +.endm + + + +.macro KERNEL4x16_I1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO,1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_I2_L2 
AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG,1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L2_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L2_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_L1_L2_I AREG,BREG, First, OffsetA,OffsetB, Index,IsLast ,Complete + +.if \First ==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 +.endif + lxv vs8, DISP32(\Index,0+\OffsetA)(\AREG) + lxv vs9, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs10, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs11, DISP32(\Index,48+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 +.else + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 +.endif + lxv vs28, DISP8(\Index,0 +\OffsetB)(\BREG) + lxv vs30, DISP8(\Index,16 +\OffsetB)(\BREG) + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs31, vs30, vs30,2 +.if \First ==1 + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + +.else + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + +.endif + lxv vs12, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs13, DISP32(\Index,80+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + +.else + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 +.endif + lxv vs14, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs15, DISP32(\Index,112+\OffsetA)(\AREG) +.if \First ==1 + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + + + + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 + +.else + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + + + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 +.endif + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 +.if \Complete==0 + lxv vs0, DISP32(\Index,128+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,144+\OffsetA)(\AREG) +.endif + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 +.if \Complete==0 + lxv vs24, DISP8(\Index,32 +\OffsetB)(\BREG) + lxv vs26, DISP8(\Index,48 +\OffsetB)(\BREG) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endif + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + 
xvmaddadp vs43, vs11, vs29 +.if \Complete==0 + lxv vs2, DISP32(\Index,160+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,176+\OffsetA)(\AREG) +.endif + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 +.if \Complete==0 + lxv vs4, DISP32(\Index,192+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,208+\OffsetA)(\AREG) +.endif + xvmaddadp vs52, vs12, vs30 + xvmaddadp vs53, vs13, vs30 + xvmaddadp vs54, vs14, vs30 + xvmaddadp vs55, vs15, vs30 +.if \Complete==0 + lxv vs6, DISP32(\Index,224+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,240+\OffsetA)(\AREG) +.endif + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 + + + xvmaddadp vs60, vs12, vs31 + + xvmaddadp vs61, vs13, vs31 + xvmaddadp vs62, vs14, vs31 + + xvmaddadp vs63, vs15, vs31 + .if \IsLast==1 + .if \Complete==1 + addi \AREG, \AREG, DISP32(\Index,128+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,32+\OffsetB) + .else + addi \AREG, \AREG, DISP32(\Index,256) + addi \BREG, \BREG, DISP8(\Index,64) + .endif + .endif + + +.endm + + + +.macro KERNEL4x16 First + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + lxv vs4, 64(AO) + lxv vs5, 80(AO) + lxv vs6, 96(AO) + lxv vs7, 112(AO) + + + + addi BO, BO, 32 + addi AO, AO, 128 + +.if \First==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + xvmuldp vs52, vs4, vs26 + xvmuldp vs53, vs5, vs26 + xvmuldp vs54, vs6, vs26 + xvmuldp vs55, vs7, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + xvmuldp vs60, vs4, vs27 + xvmuldp vs61, vs5, vs27 + xvmuldp vs62, vs6, vs27 + xvmuldp vs63, vs7, vs27 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + xvmaddadp vs52, vs4, vs26 + xvmaddadp vs53, vs5, vs26 + xvmaddadp vs54, vs6, vs26 + xvmaddadp vs55, vs7, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + xvmaddadp vs60, vs4, vs27 + xvmaddadp vs61, vs5, vs27 + xvmaddadp vs62, vs6, vs27 + xvmaddadp vs63, vs7, vs27 + +.endif +.endm + +.macro SAVE4x16_REGS + add C2, CO, LDC + add C3, C2, LDC + add C4, C3, LDC +.endm + +.macro SAVE4x16 +#ifndef TRMMKERNEL + lxv vs0, 0(CO) + lxv vs2, 16(CO) + lxv vs4, 32(CO) + lxv vs6, 48(CO) +#endif + xxpermdi vs8, vs40,vs32,1 + 
xxpermdi vs9 ,vs32,vs40,1 +#ifndef TRMMKERNEL + lxv vs24, 64(CO) + lxv vs26, 80(CO) + lxv vs28, 96(CO) + lxv vs30, 112(CO) +#endif + xxpermdi vs10, vs41,vs33,1 + xxpermdi vs11 ,vs33,vs41,1 +#ifndef TRMMKERNEL + lxv vs1, 0(C2) + lxv vs3, 16(C2) + lxv vs5, 32(C2) + lxv vs7, 48(C2) +#endif + xxpermdi vs12, vs42,vs34,1 + xxpermdi vs13 ,vs34,vs42,1 +#ifndef TRMMKERNEL + lxv vs25, 64(C2) + lxv vs27, 80(C2) +#endif + xxpermdi vs14, vs43,vs35,1 + xxpermdi vs15 ,vs35,vs43,1 +#ifndef TRMMKERNEL + lxv vs29, 96(C2) + lxv vs31, 112(C2) +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + +#endif + xxpermdi vs8, vs44,vs36,1 + xxpermdi vs9 ,vs36,vs44,1 + xxpermdi vs10, vs45,vs37,1 + xxpermdi vs11 ,vs37,vs45,1 +#ifndef TRMMKERNEL + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r +#endif + xxpermdi vs12, vs46,vs38,1 + xxpermdi vs13 ,vs38,vs46,1 + xxpermdi vs14, vs47,vs39,1 + xxpermdi vs15 ,vs39,vs47,1 + +#ifndef TRMMKERNEL + xvmaddadp vs24, vs8, alpha_r + xvmaddadp vs25, vs9, alpha_r + xvmaddadp vs26, vs10, alpha_r + xvmaddadp vs27, vs11, alpha_r + + xvmaddadp vs28, vs12, alpha_r + xvmaddadp vs29, vs13, alpha_r + xvmaddadp vs30, vs14, alpha_r + xvmaddadp vs31, vs15, alpha_r +#else + xvmuldp vs24, vs8, alpha_r + xvmuldp vs25, vs9, alpha_r + xvmuldp vs26, vs10, alpha_r + xvmuldp vs27, vs11, alpha_r + + xvmuldp vs28, vs12, alpha_r + xvmuldp vs29, vs13, alpha_r + xvmuldp vs30, vs14, alpha_r + xvmuldp vs31, vs15, alpha_r + +#endif + stxv vs0, 0(CO) + stxv vs2, 16(CO) + stxv vs4, 32(CO) + stxv vs6, 48(CO) + + stxv vs24, 64(CO) + stxv vs26, 80(CO) + stxv vs28, 96(CO) + stxv vs30, 112(CO) + + stxv vs1, 0(C2) + stxv vs3, 16(C2) + stxv vs5, 32(C2) + stxv vs7, 48(C2) + + stxv vs25, 64(C2) + stxv vs27, 80(C2) + stxv vs29, 96(C2) + stxv vs31, 112(C2) +#ifndef TRMMKERNEL + lxv vs0, 0(C3) + lxv vs2, 16(C3) + lxv vs4, 32(C3) + lxv vs6, 48(C3) +#endif + xxpermdi vs8, vs56,vs48,1 + xxpermdi vs9 ,vs48,vs56,1 +#ifndef TRMMKERNEL + lxv vs24, 64(C3) + lxv vs26, 80(C3) +#endif + xxpermdi vs10, vs57,vs49,1 + xxpermdi vs11 ,vs49,vs57,1 +#ifndef TRMMKERNEL + lxv vs28, 96(C3) + lxv vs30, 112(C3) +#endif + xxpermdi vs12, vs58,vs50,1 + xxpermdi vs13 ,vs50,vs58,1 +#ifndef TRMMKERNEL + lxv vs1, 0(C4) + lxv vs3, 16(C4) +#endif + xxpermdi vs14, vs59,vs51,1 + xxpermdi vs15 ,vs51,vs59,1 +#ifndef TRMMKERNEL + lxv vs5, 32(C4) + lxv vs7, 48(C4) + + lxv vs25, 64(C4) + lxv vs27, 80(C4) + lxv vs29, 96(C4) + lxv vs31, 112(C4) +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + +#endif + + xxpermdi vs8, vs60,vs52,1 + xxpermdi vs9 ,vs52,vs60,1 + xxpermdi vs10, vs61,vs53,1 + xxpermdi vs11 ,vs53,vs61,1 +#ifndef TRMMKERNEL + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r +#endif + + + xxpermdi vs12, vs62,vs54,1 + xxpermdi vs13 
,vs54,vs62,1 + xxpermdi vs14, vs63,vs55,1 + xxpermdi vs15 ,vs55,vs63,1 +#ifndef TRMMKERNEL + xvmaddadp vs24, vs8, alpha_r + xvmaddadp vs25, vs9, alpha_r + xvmaddadp vs26, vs10, alpha_r + xvmaddadp vs27, vs11, alpha_r + + xvmaddadp vs28, vs12, alpha_r + xvmaddadp vs29, vs13, alpha_r + xvmaddadp vs30, vs14, alpha_r + xvmaddadp vs31, vs15, alpha_r +#else + xvmuldp vs24, vs8, alpha_r + xvmuldp vs25, vs9, alpha_r + xvmuldp vs26, vs10, alpha_r + xvmuldp vs27, vs11, alpha_r + + xvmuldp vs28, vs12, alpha_r + xvmuldp vs29, vs13, alpha_r + xvmuldp vs30, vs14, alpha_r + xvmuldp vs31, vs15, alpha_r +#endif + stxv vs0, 0(C3) + stxv vs2, 16(C3) + stxv vs4, 32(C3) + stxv vs6, 48(C3) + + stxv vs24, 64(C3) + stxv vs26, 80(C3) + stxv vs28, 96(C3) + stxv vs30, 112(C3) + + stxv vs1, 0(C4) + stxv vs3, 16(C4) + stxv vs5, 32(C4) + stxv vs7, 48(C4) + + stxv vs25, 64(C4) + stxv vs27, 80(C4) + stxv vs29, 96(C4) + stxv vs31, 112(C4) + + addi CO, CO, 128 +.endm + +/********************************************************************* +* Macros for N=4, M=8 * +*********************************************************************/ + +.macro LOAD4x8_1 + LOAD4x8 1 +.endm + +.macro LOAD4x8_0 + LOAD4x8 0 +.endm +.macro LOAD4x8 Zero + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + +.if \Zero==1 + xxlxor vs32,vs32,vs32 + xxlxor vs33,vs33,vs33 + xxlxor vs34,vs34,vs34 + xxlxor vs35,vs35,vs35 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + +.endif +.endm + + + +.macro KERNEL4x8_L1_L2 Index,IsLast + KERNEL4x8_L1_L2_I 0,0,0, \Index,\IsLast,0 +.endm + + + +.macro KERNEL4x8_I1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 1,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x8_L1_L2_I First, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index,0+\OffsetA)(AO) + lxv vs9, DISP16(\Index,16+\OffsetA)(AO) +.if \First ==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 +.endif + + lxv vs10, DISP16(\Index,32+\OffsetA)(AO) + lxv vs11, DISP16(\Index,48+\OffsetA)(AO) + + + +.if \First ==1 + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + +.else + + lxv vs28, DISP8(\Index,0 +\OffsetB)(BO) + lxv vs30, DISP8(\Index,16 +\OffsetB)(BO) + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + +.endif + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs31, vs30, vs30,2 +.if \First ==1 + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + 
+.else + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + +.endif + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 +.if \Complete==0 + lxv vs0, DISP16(\Index,64+\OffsetA)(AO) + lxv vs1, DISP16(\Index,80+\OffsetA)(AO) +.endif + + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.if \Complete==0 + lxv vs2, DISP16(\Index,96+\OffsetA)(AO) + lxv vs3, DISP16(\Index,112+\OffsetA)(AO) +.endif + + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + xvmaddadp vs50, vs10, vs30 + xvmaddadp vs51, vs11, vs30 +.if \Complete==0 + lxv vs24, DISP8(\Index,32 +\OffsetB)(BO) + lxv vs26, DISP8(\Index,48 +\OffsetB)(BO) +.endif + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + xvmaddadp vs58, vs10, vs31 + xvmaddadp vs59, vs11, vs31 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 +.endif + + .if \IsLast==1 + .if \Complete==1 + addi AO, AO, DISP16(\Index,64+\OffsetA) + addi BO, BO, DISP8(\Index,32+\OffsetB) + .else + addi AO, AO, DISP16(\Index,128) + addi BO, BO, DISP8(\Index,64) + .endif + .endif + + +.endm + + + +.macro KERNEL4x8 First + + lxv vs24, 0(BO) + lxv vs26, 16(BO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + + + + + addi BO, BO, 32 + addi AO, AO, 64 + +.if \First==1 + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + xvmuldp vs50, vs2, vs26 + xvmuldp vs51, vs3, vs26 + + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + xvmuldp vs58, vs2, vs27 + xvmuldp vs59, vs3, vs27 + +.else + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + + + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + xvmaddadp vs50, vs2, vs26 + xvmaddadp vs51, vs3, vs26 + + + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + xvmaddadp vs58, vs2, vs27 + xvmaddadp vs59, vs3, vs27 + + +.endif +.endm + + + +.macro SAVE4x8 + add T2, CO, LDC + add T3, T2, LDC + add T4, T3, LDC +#ifndef TRMMKERNEL + lxv vs0, 0(CO) + lxv vs2, 16(CO) +#endif + xxpermdi vs8, vs40,vs32,1 + xxpermdi vs9 ,vs32,vs40,1 +#ifndef TRMMKERNEL + lxv vs4, 32(CO) + lxv vs6, 48(CO) +#endif + xxpermdi vs10, vs41,vs33,1 + xxpermdi vs11 ,vs33,vs41,1 +#ifndef TRMMKERNEL + lxv vs1, 0(T2) + lxv vs3, 16(T2) +#endif + xxpermdi vs12, vs42,vs34,1 + xxpermdi vs13 ,vs34,vs42,1 +#ifndef TRMMKERNEL + lxv vs5, 32(T2) + lxv vs7, 48(T2) +#endif + xxpermdi vs14, vs43,vs35,1 + xxpermdi vs15 ,vs35,vs43,1 + + + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r + + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r + +#endif + + + stxv vs0, 0(CO) + stxv vs2, 16(CO) + 
stxv vs4, 32(CO) + stxv vs6, 48(CO) + + + stxv vs1, 0(T2) + stxv vs3, 16(T2) + stxv vs5, 32(T2) + stxv vs7, 48(T2) + + + xxpermdi vs8, vs56,vs48,1 + xxpermdi vs9 ,vs48,vs56,1 +#ifndef TRMMKERNEL + lxv vs0, 0(T3) + lxv vs2, 16(T3) +#endif + xxpermdi vs10, vs57,vs49,1 + xxpermdi vs11 ,vs49,vs57,1 +#ifndef TRMMKERNEL + lxv vs4, 32(T3) + lxv vs6, 48(T3) +#endif + xxpermdi vs12, vs58,vs50,1 + xxpermdi vs13 ,vs50,vs58,1 +#ifndef TRMMKERNEL + lxv vs1, 0(T4) + lxv vs3, 16(T4) +#endif + xxpermdi vs14, vs59,vs51,1 + xxpermdi vs15 ,vs51,vs59,1 +#ifndef TRMMKERNEL + lxv vs5, 32(T4) + lxv vs7, 48(T4) + + + xvmaddadp vs0, vs8, alpha_r + xvmaddadp vs1, vs9, alpha_r + xvmaddadp vs2, vs10, alpha_r + xvmaddadp vs3, vs11, alpha_r + + + + xvmaddadp vs4, vs12, alpha_r + xvmaddadp vs5, vs13, alpha_r + xvmaddadp vs6, vs14, alpha_r + xvmaddadp vs7, vs15, alpha_r +#else + xvmuldp vs0, vs8, alpha_r + xvmuldp vs1, vs9, alpha_r + xvmuldp vs2, vs10, alpha_r + xvmuldp vs3, vs11, alpha_r + + + + xvmuldp vs4, vs12, alpha_r + xvmuldp vs5, vs13, alpha_r + xvmuldp vs6, vs14, alpha_r + xvmuldp vs7, vs15, alpha_r + +#endif + + + stxv vs0, 0(T3) + stxv vs2, 16(T3) + stxv vs4, 32(T3) + stxv vs6, 48(T3) + + + stxv vs1, 0(T4) + stxv vs3, 16(T4) + stxv vs5, 32(T4) + stxv vs7, 48(T4) + + + + addi CO, CO, 64 +.endm + + +/********************************************************************* +* Macros for N=4, M=4 * +*********************************************************************/ + +.macro LOAD4x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + + xvmaddadp vs48, vs8, vs30 + xvmaddadp vs49, vs9, vs30 + + xvmaddadp vs56, vs8, vs31 + xvmaddadp vs57, vs9, vs31 + +.endm + +.macro KERNEL4x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + + xvmuldp 
vs48, vs0, vs26 + xvmuldp vs49, vs1, vs26 + + xvmuldp vs56, vs0, vs27 + xvmuldp vs57, vs1, vs27 + +.endm + +.macro KERNEL4x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 32 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + + xvmaddadp vs48, vs0, vs26 + xvmaddadp vs49, vs1, vs26 + + xvmaddadp vs56, vs0, vs27 + xvmaddadp vs57, vs1, vs27 + +.endm + +.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r + xvmaddadp vs1, vs49, alpha_r +#else + xvmuldp vs0, vs48, alpha_r + xvmuldp vs1, vs49, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r + xvmaddadp vs9, vs57, alpha_r +#else + xvmuldp vs8, vs56, alpha_r + xvmuldp vs9, vs57, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=4, M=2 * +*********************************************************************/ + +.macro LOAD4x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + lxvdsx vs30, o16, BO + lxvdsx vs31, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + + xvmaddadp vs48, vs8, vs30 + + xvmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + + xvmuldp vs48, vs0, vs26 + + xvmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + 
lxvdsx vs25, o8, BO + lxvdsx vs26, o16, BO + lxvdsx vs27, o24, BO + + addi AO, AO, 16 + addi BO, BO, 32 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + + xvmaddadp vs48, vs0, vs26 + + xvmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs48, alpha_r +#else + xvmuldp vs0, vs48, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs56, alpha_r +#else + xvmuldp vs8, vs56, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=4, M=1 * +*********************************************************************/ + +.macro LOAD4x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + +.endm + +.macro KERNEL4x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + lxsdx vs30, o16, BO + lxsdx vs31, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + + xsmaddadp vs48, vs8, vs30 + + xsmaddadp vs56, vs8, vs31 + +.endm + +.macro KERNEL4x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + + xsmuldp vs48, vs0, vs26 + + xsmuldp vs56, vs0, vs27 + +.endm + +.macro KERNEL4x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + lxsdx vs26, o16, BO + lxsdx vs27, o24, BO + + addi AO, AO, 8 + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + + xsmaddadp vs48, vs0, vs26 + + xsmaddadp vs56, vs0, vs27 + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs48, alpha_r +#else + xsmuldp vs0, vs48, 
alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs56, alpha_r +#else + xsmuldp vs8, vs56, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=2, M=16 * +*********************************************************************/ + +.macro LOAD2x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL2x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + xvmaddadp vs44, vs12, vs29 + xvmaddadp vs45, vs13, vs29 + xvmaddadp vs46, vs14, vs29 + xvmaddadp vs47, vs15, vs29 + +.endm + +.macro KERNEL2x16_SUBI1 + + 
lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + xvmuldp vs44, vs4, vs25 + xvmuldp vs45, vs5, vs25 + xvmuldp vs46, vs6, vs25 + xvmuldp vs47, vs7, vs25 + +.endm + +.macro KERNEL2x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + xvmaddadp vs44, vs4, vs25 + xvmaddadp vs45, vs5, vs25 + xvmaddadp vs46, vs6, vs25 + xvmaddadp vs47, vs7, vs25 + +.endm + +.macro SAVE2x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 + + lxvd2x vs12, 0, T2 + lxvd2x vs13, o16, T2 + lxvd2x vs14, o32, T2 + lxvd2x vs15, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r + xvmaddadp vs12, vs44, alpha_r + xvmaddadp vs13, vs45, alpha_r + xvmaddadp vs14, vs46, alpha_r + xvmaddadp vs15, vs47, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r + xvmuldp vs12, vs44, alpha_r + xvmuldp vs13, vs45, alpha_r + xvmuldp vs14, vs46, alpha_r + xvmuldp vs15, vs47, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + stxvd2x vs12, 0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * 
+*********************************************************************/ + +.macro LOAD2x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + xvmaddadp vs42, vs10, vs29 + xvmaddadp vs43, vs11, vs29 + +.endm + +.macro KERNEL2x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + xvmuldp vs42, vs2, vs25 + xvmuldp vs43, vs3, vs25 + +.endm + +.macro KERNEL2x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 64 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + xvmaddadp vs42, vs2, vs25 + xvmaddadp vs43, vs3, vs25 + +.endm + +.macro SAVE2x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 + lxvd2x vs10, o32, T1 + lxvd2x vs11, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r + xvmaddadp vs10, vs42, alpha_r + xvmaddadp vs11, vs43, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r + 
xvmuldp vs10, vs42, alpha_r + xvmuldp vs11, vs43, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=2, M=4 * +*********************************************************************/ + +.macro LOAD2x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + + xvmaddadp vs40, vs8, vs29 + xvmaddadp vs41, vs9, vs29 + +.endm + +.macro KERNEL2x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + + xvmuldp vs40, vs0, vs25 + xvmuldp vs41, vs1, vs25 + +.endm + +.macro KERNEL2x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 32 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + + xvmaddadp vs40, vs0, vs25 + xvmaddadp vs41, vs1, vs25 + +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 + lxvd2x vs9, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r + xvmaddadp vs9, vs41, alpha_r +#else + xvmuldp vs8, vs40, alpha_r + xvmuldp vs9, vs41, alpha_r +#endif + + stxvd2x vs8, 0, T1 + stxvd2x vs9, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=2, M=2 * +*********************************************************************/ + +.macro LOAD2x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + lxvdsx vs29, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, 
vs29 + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddadp vs32, vs8, vs28 + + xvmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmuldp vs32, vs0, vs24 + + xvmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + lxvdsx vs25, o8, BO + + addi AO, AO, 16 + addi BO, BO, 16 + + + xvmaddadp vs32, vs0, vs24 + + xvmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxvd2x vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs8, vs40, alpha_r +#else + xvmuldp vs8, vs40, alpha_r +#endif + + stxvd2x vs8, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=2, M=1 * +*********************************************************************/ + +.macro LOAD2x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + +.endm + +.macro KERNEL2x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + lxsdx vs29, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs8, vs28 + + xsmaddadp vs40, vs8, vs29 + +.endm + +.macro KERNEL2x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs24 + + xsmuldp vs40, vs0, vs25 + +.endm + +.macro KERNEL2x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + lxsdx vs25, o8, BO + + addi AO, AO, 8 + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs24 + + xsmaddadp vs40, vs0, vs25 + +.endm + +.macro SAVE2x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + lxsdx vs8, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs8, vs40, alpha_r +#else + xsmuldp vs8, vs40, alpha_r +#endif + + stxsdx vs8, 0, T1 + + addi CO, CO, 8 + +.endm + +/********************************************************************* +* Macros for N=1, M=16 * +*********************************************************************/ + +.macro LOAD1x16_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + +.endm + +.macro KERNEL1x16_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, 
vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs12, 0, AO + lxvd2x vs13, o16, AO + lxvd2x vs14, o32, AO + lxvd2x vs15, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + xvmaddadp vs36, vs12, vs28 + xvmaddadp vs37, vs13, vs28 + xvmaddadp vs38, vs14, vs28 + xvmaddadp vs39, vs15, vs28 + +.endm + +.macro KERNEL1x16_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + xvmuldp vs36, vs4, vs24 + xvmuldp vs37, vs5, vs24 + xvmuldp vs38, vs6, vs24 + xvmuldp vs39, vs7, vs24 + +.endm + +.macro KERNEL1x16_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + lxvd2x vs4, 0, AO + lxvd2x vs5, o16, AO + lxvd2x vs6, o32, AO + lxvd2x vs7, o48, AO + + addi AO, AO, 64 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + xvmaddadp vs36, vs4, vs24 + xvmaddadp vs37, vs5, vs24 + xvmaddadp vs38, vs6, vs24 + xvmaddadp vs39, vs7, vs24 + +.endm + +.macro SAVE1x16 + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 + + lxvd2x vs4, 0, T2 + lxvd2x vs5, o16, T2 + lxvd2x vs6, o32, T2 + lxvd2x vs7, o48, T2 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r + xvmaddadp vs4, vs36, alpha_r + xvmaddadp vs5, vs37, alpha_r + xvmaddadp vs6, vs38, alpha_r + xvmaddadp vs7, vs39, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r + xvmuldp vs4, vs36, alpha_r + xvmuldp vs5, vs37, alpha_r + xvmuldp vs6, vs38, alpha_r + xvmuldp vs7, vs39, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + stxvd2x vs4, 0, T2 + stxvd2x vs5, o16, T2 + stxvd2x vs6, o32, T2 + stxvd2x vs7, o48, T2 + + addi CO, CO, 128 + +.endm + +/********************************************************************* +* Macros for N=4, M=8 * 
+*********************************************************************/ + +.macro LOAD1x8_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x8_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + lxvd2x vs10, o32, AO + lxvd2x vs11, o48, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + xvmaddadp vs34, vs10, vs28 + xvmaddadp vs35, vs11, vs28 + +.endm + +.macro KERNEL1x8_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + xvmuldp vs34, vs2, vs24 + xvmuldp vs35, vs3, vs24 + +.endm + +.macro KERNEL1x8_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + lxvd2x vs2, o32, AO + lxvd2x vs3, o48, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 64 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + xvmaddadp vs34, vs2, vs24 + xvmaddadp vs35, vs3, vs24 + +.endm + +.macro SAVE1x8 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 + lxvd2x vs2, o32, T1 + lxvd2x vs3, o48, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r + xvmaddadp vs2, vs34, alpha_r + xvmaddadp vs3, vs35, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r + xvmuldp vs2, vs34, alpha_r + xvmuldp vs3, vs35, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 + + addi CO, CO, 64 + +.endm + +/********************************************************************* +* Macros for N=1, M=4 * +*********************************************************************/ + +.macro LOAD1x4_1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x4_I1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_1 + + lxvd2x vs8, 0, AO + lxvd2x vs9, o16, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_2 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddadp vs32, vs8, vs28 + xvmaddadp vs33, vs9, vs28 + +.endm + +.macro KERNEL1x4_SUBI1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, 
vs24 + xvmuldp vs33, vs1, vs24 + +.endm + +.macro KERNEL1x4_SUB1 + + lxvd2x vs0, 0, AO + lxvd2x vs1, o16, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 32 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + xvmaddadp vs33, vs1, vs24 + +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 + lxvd2x vs1, o16, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r + xvmaddadp vs1, vs33, alpha_r +#else + xvmuldp vs0, vs32, alpha_r + xvmuldp vs1, vs33, alpha_r +#endif + + stxvd2x vs0, 0, T1 + stxvd2x vs1, o16, T1 + + addi CO, CO, 32 + +.endm + +/********************************************************************* +* Macros for N=1, M=2 * +*********************************************************************/ + +.macro LOAD1x2_1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x2_I1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_1 + + lxvd2x vs8, 0, AO + + lxvdsx vs28, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_2 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x2_SUBI1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x2_SUB1 + + lxvd2x vs0, 0, AO + + lxvdsx vs24, 0, BO + + addi AO, AO, 16 + addi BO, BO, 8 + + + xvmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + lxvd2x vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xvmaddadp vs0, vs32, alpha_r +#else + xvmuldp vs0, vs32, alpha_r +#endif + + stxvd2x vs0, 0, T1 + + addi CO, CO, 16 + +.endm + +/********************************************************************* +* Macros for N=1, M=1 * +*********************************************************************/ + +.macro LOAD1x1_1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + +.endm + +.macro KERNEL1x1_I1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_1 + + lxsdx vs8, 0, AO + + lxsdx vs28, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_2 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs8, vs28 + +.endm + +.macro KERNEL1x1_SUBI1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs24 + +.endm + +.macro KERNEL1x1_SUB1 + + lxsdx vs0, 0, AO + + lxsdx vs24, 0, BO + + addi AO, AO, 8 + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs24 + +.endm + +.macro SAVE1x1 + + mr T1, CO + +#ifndef TRMMKERNEL + lxsdx vs0, 0, T1 +#endif + +#ifndef TRMMKERNEL + xsmaddadp vs0, vs32, alpha_r +#else + xsmuldp vs0, vs32, alpha_r +#endif + + stxsdx vs0, 0, T1 + + addi CO, CO, 8 + +.endm + + + + +/****************************TRMM POINTER REFRESH MACROSES*************************/ + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 4 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 3 + .endif +.endm + +/* 
+//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif + +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif + +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif .endm \ No newline at end of file diff --git a/kernel/power/icamax.c b/kernel/power/icamax.c index bd74d20e5..58dcdec5a 100644 --- a/kernel/power/icamax.c +++ b/kernel/power/icamax.c @@ -1,328 +1,328 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
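[Editor's note, not part of the patch] The TRMM pointer-refresh macros that close the assembly file above (REFRESH_POINTERS, REFRESH_TEMP_BK, REFRESH_AFTER_SAVE) only encode the C pseudo-code quoted in their comments; SHIFT_REG turns an element count into a byte offset (off * tile * sizeof(double), hence shift amounts of log2(tile*8)). The following is a minimal C sketch of that bookkeeping under the same LEFT/TRANSA conventions; the trmm_* names and FLOAT_T typedef are illustrative only, and pointer arithmetic is shown in elements rather than bytes.

    #include <stddef.h>

    typedef double FLOAT_T;              /* stand-in for the kernel's FLOAT     */

    static void trmm_refresh_pointers(FLOAT_T **ptrba, FLOAT_T **ptrbb,
                                      FLOAT_T *bb, ptrdiff_t off,
                                      int c_a, int c_b, int left, int transa)
    {
        if ((left && transa) || (!left && !transa)) {
            *ptrbb = bb;                 /* ptrbb = bb                          */
        } else {
            *ptrba += off * c_a;         /* ptrba += off * C_A                  */
            *ptrbb  = bb + off * c_b;    /* ptrbb  = bb + off * C_B             */
        }
    }

    static ptrdiff_t trmm_temp_bk(ptrdiff_t bk, ptrdiff_t off,
                                  int c_a, int c_b, int left, int transa)
    {
        if ((left && !transa) || (!left && transa))
            return bk - off;             /* temp = bk - off                     */
        return left ? off + c_a          /* temp = off + (values in A)          */
                    : off + c_b;         /* temp = off + (values in B)          */
    }

    static void trmm_after_save(FLOAT_T **ptrba, FLOAT_T **ptrbb,
                                ptrdiff_t bk, ptrdiff_t *off,
                                int c_a, int c_b, int left, int transa)
    {
        if ((left && transa) || (!left && !transa)) {
            ptrdiff_t temp = bk - *off - (left ? c_a : c_b);
            *ptrba += temp * c_a;        /* skip the part of A already consumed */
            *ptrbb += temp * c_b;        /* skip the part of B already consumed */
        }
        if (left)
            *off += c_a;                 /* off advances by the tile height     */
    }
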
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -#include "common.h" -#include -#include -#if defined(DOUBLE) - #define ABS fabs -#else - #define ABS fabsf -#endif -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) - -#define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code - -#if !defined(USE_MASK_PERMUTATIONS) - -static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){ - __vector float result; - __asm__ ( - "vmrgew %0,%1,%2;\n" - : "=v" (result) - : "v" (a), - "v" (b) - : ); - return result; -} - -static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){ - __vector float result; - __asm__ ( - "vmrgow %0,%1,%2;\n" - : "=v" (result) - : "v" (a), - "v" (b) - : ); - return result; -} - -#endif - -/** - * Find maximum index - * Warning: requirements n>0 and n % 32 == 0 - * @param n - * @param x pointer to the vector - * @param maxf (out) maximum absolute value .( only for output ) - * @return index - */ -static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { - - BLASLONG index; - BLASLONG i=0; -#if defined(USE_MASK_PERMUTATIONS) - register __vector unsigned int static_index0 = {0,1,2,3}; -#else - register __vector unsigned int static_index0 = {2,0,3,1}; -#endif - register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register - register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0; - register __vector unsigned int static_index2=static_index0 +temp1; - register __vector unsigned int static_index3=static_index1 +temp1; - temp0=vec_xor(temp0,temp0); - temp1=temp1 <<1 ; //{16,16,16,16} - register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} - register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} - register __vector float quadruple_values={0,0,0,0}; - - register __vector float * v_ptrx=(__vector float *)x; -#if defined(USE_MASK_PERMUTATIONS) - register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; - register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; -#endif - for(; i31 - - //find final quadruple from 32 elements - r2=vec_cmpgt(vv0,vf0); - ind2 = vec_sel( indf0,indv0,r2); - vv0= vec_sel(vf0,vv0,r2); - //get asbolute index - ind2+=temp0; - //compare with old quadruple and update - r1=vec_cmpgt(vv0,quadruple_values); - quadruple_indices = vec_sel( quadruple_indices,ind2,r1); - quadruple_values= vec_sel(quadruple_values,vv0,r1); - - temp0+=temp_add; - } - - //now we have to chose from 4 values and 4 different indices - // we will compare pairwise if pairs 
are exactly the same we will choose minimum between index - // otherwise we will assign index of the maximum value - float a1,a2,a3,a4; - unsigned int i1,i2,i3,i4; - a1=vec_extract(quadruple_values,0); - a2=vec_extract(quadruple_values,1); - a3=vec_extract(quadruple_values,2); - a4=vec_extract(quadruple_values,3); - i1=vec_extract(quadruple_indices,0); - i2=vec_extract(quadruple_indices,1); - i3=vec_extract(quadruple_indices,2); - i4=vec_extract(quadruple_indices,3); - if(a1==a2){ - index=i1>i2?i2:i1; - }else if(a2>a1){ - index=i2; - a1=a2; - }else{ - index= i1; - } - - if(a4==a3){ - i1=i3>i4?i4:i3; - }else if(a4>a3){ - i1=i4; - a3=a4; - }else{ - i1= i3; - } - - if(a1==a3){ - index=i1>index?index:i1; - *maxf=a1; - }else if(a3>a1){ - index=i1; - *maxf=a3; - }else{ - *maxf=a1; - } - return index; - -} - - - - - - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0; - BLASLONG max = 0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(max); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - - max = ciamax_kernel_32(n1, x, &maxf); - i = n1; - ix = n1 << 1; - } - - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (max + 1); - - } else { - - inc_x2 = 2 * inc_x; - - maxf = CABS1(x,0); - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (max + 1); - } - -} - - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +#define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code + +#if !defined(USE_MASK_PERMUTATIONS) + +static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){ + __vector float result; + __asm__ ( + "vmrgew %0,%1,%2;\n" + : "=v" (result) + : "v" (a), + "v" (b) + : ); + return result; +} + +static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){ + __vector float result; + __asm__ ( + "vmrgow %0,%1,%2;\n" + : "=v" (result) + : "v" (a), + "v" (b) + : ); + return result; +} + +#endif + +/** + * Find maximum index + * Warning: requirements n>0 and n % 32 == 0 + * @param n + * @param x pointer to the vector + * @param maxf (out) maximum absolute value .( only for output ) + * @return index + */ +static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { + + BLASLONG index; + BLASLONG i=0; +#if defined(USE_MASK_PERMUTATIONS) + register __vector unsigned int static_index0 = {0,1,2,3}; +#else + register __vector unsigned int static_index0 = {2,0,3,1}; +#endif + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0; + register __vector unsigned int static_index2=static_index0 +temp1; + register __vector unsigned int static_index3=static_index1 +temp1; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + register __vector float quadruple_values={0,0,0,0}; + + register __vector float * v_ptrx=(__vector float *)x; +#if defined(USE_MASK_PERMUTATIONS) + register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; + register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; +#endif + for(; i31 + + //find final quadruple from 32 elements + r2=vec_cmpgt(vv0,vf0); + ind2 = vec_sel( indf0,indv0,r2); + vv0= vec_sel(vf0,vv0,r2); + //get asbolute index + ind2+=temp0; + //compare with old quadruple and update + r1=vec_cmpgt(vv0,quadruple_values); + quadruple_indices = vec_sel( quadruple_indices,ind2,r1); + quadruple_values= vec_sel(quadruple_values,vv0,r1); + + temp0+=temp_add; + } + + //now we have to chose from 4 values and 4 different indices + // we will compare pairwise if pairs are exactly the same we will choose minimum between index + // otherwise we will assign index of the maximum value + float a1,a2,a3,a4; + unsigned int i1,i2,i3,i4; + a1=vec_extract(quadruple_values,0); + a2=vec_extract(quadruple_values,1); + a3=vec_extract(quadruple_values,2); + a4=vec_extract(quadruple_values,3); + i1=vec_extract(quadruple_indices,0); + i2=vec_extract(quadruple_indices,1); + i3=vec_extract(quadruple_indices,2); + i4=vec_extract(quadruple_indices,3); + if(a1==a2){ + index=i1>i2?i2:i1; + }else if(a2>a1){ + index=i2; + a1=a2; + }else{ + index= i1; + } + + if(a4==a3){ + i1=i3>i4?i4:i3; + }else if(a4>a3){ + i1=i4; + a3=a4; + }else{ + i1= i3; + } + + if(a1==a3){ + index=i1>index?index:i1; + *maxf=a1; + }else if(a3>a1){ + index=i1; + *maxf=a3; + 
}else{ + *maxf=a1; + } + return index; + +} + + + + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + max = ciamax_kernel_32(n1, x, &maxf); + i = n1; + ix = n1 << 1; + } + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (max + 1); + + } else { + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (max + 1); + } + +} + + diff --git a/kernel/power/icamin.c b/kernel/power/icamin.c index 336766245..843370c6c 100644 --- a/kernel/power/icamin.c +++ b/kernel/power/icamin.c @@ -1,266 +1,266 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
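[Editor's note, not part of the patch] The ICAMAX driver above pairs the 32-element VSX kernel with a scalar tail and a strided fallback, both built on CABS1 (|re| + |im|) and both returning a Fortran-style 1-based index. A plain reference for the semantics the vectorized path must reproduce is sketched below; ref_icamax is an illustrative name, not a symbol from the patch.

    #include <math.h>
    #include <stddef.h>

    static ptrdiff_t ref_icamax(ptrdiff_t n, const float *x, ptrdiff_t inc_x)
    {
        if (n <= 0 || inc_x <= 0) return 0;      /* BLAS convention: 0 on bad input */
        ptrdiff_t best = 0, ix = 0;
        float bestv = fabsf(x[0]) + fabsf(x[1]); /* CABS1 of element 0              */
        for (ptrdiff_t i = 1; i < n; i++) {
            ix += 2 * inc_x;                     /* complex elements: 2 floats each */
            float v = fabsf(x[ix]) + fabsf(x[ix + 1]);
            if (v > bestv) {                     /* strict '>' keeps the first max  */
                best = i;
                bestv = v;
            }
        }
        return best + 1;                         /* Fortran-style 1-based result    */
    }
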
-*****************************************************************************/ - - -#include "common.h" -#include -#include -#if defined(DOUBLE) - #define ABS fabs -#else - #define ABS fabsf -#endif -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) - - - - -/** - * Find minimum index - * Warning: requirements n>0 and n % 32 == 0 - * @param n - * @param x pointer to the vector - * @param minf (out) minimum absolute value .( only for output ) - * @return index - */ -static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { - - BLASLONG index; - BLASLONG i=0; - register __vector unsigned int static_index0 = {0,1,2,3}; - register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register - register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; - register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; - register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; - temp0=vec_xor(temp0,temp0); - temp1=temp1 <<1 ; //{16,16,16,16} - register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} - register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} - float first_min=CABS1(x,0); - register __vector float quadruple_values={first_min,first_min,first_min,first_min}; - - register __vector float * v_ptrx=(__vector float *)x; - register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; - register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; - for(; i31 - - //find final quadruple from 32 elements - r2=vec_cmpgt(vf0,vv0); - ind2 = vec_sel( indf0,indv0,r2); - vv0= vec_sel(vf0,vv0,r2); - //get asbolute index - ind2+=temp0; - //compare with old quadruple and update - r1=vec_cmpgt(quadruple_values,vv0); - quadruple_indices = vec_sel( quadruple_indices,ind2,r1); - quadruple_values= vec_sel(quadruple_values,vv0,r1); - - temp0+=temp_add; - } - - //now we have to chose from 4 values and 4 different indices - // we will compare pairwise if pairs are exactly the same we will choose minimum between index - // otherwise we will assign index of the minimum value - float a1,a2,a3,a4; - unsigned int i1,i2,i3,i4; - a1=vec_extract(quadruple_values,0); - a2=vec_extract(quadruple_values,1); - a3=vec_extract(quadruple_values,2); - a4=vec_extract(quadruple_values,3); - i1=vec_extract(quadruple_indices,0); - i2=vec_extract(quadruple_indices,1); - i3=vec_extract(quadruple_indices,2); - i4=vec_extract(quadruple_indices,3); - if(a1==a2){ - index=i1>i2?i2:i1; - }else if(a2i4?i4:i3; - }else if(a4index?index:i1; - *minf=a1; - }else if(a3 0) { - - min = ciamin_kernel_32(n1, x, &minf); - i = n1; - ix = n1 << 1; - } - - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (min + 1); - - } else { - - inc_x2 = 2 * inc_x; - - minf = CABS1(x,0); - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (min + 1); - } - -} - - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + + + + +/** + * Find minimum index + * Warning: requirements n>0 and n % 32 == 0 + * @param n + * @param x pointer to the vector + * @param minf (out) minimum absolute value .( only for output ) + * @return index + */ +static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { + + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + float first_min=CABS1(x,0); + register __vector float quadruple_values={first_min,first_min,first_min,first_min}; + + register __vector float * v_ptrx=(__vector float *)x; + register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; + register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; + for(; i31 + + //find final quadruple from 32 elements + r2=vec_cmpgt(vf0,vv0); + ind2 = vec_sel( indf0,indv0,r2); + vv0= vec_sel(vf0,vv0,r2); + //get asbolute index + ind2+=temp0; + //compare with old quadruple and update + r1=vec_cmpgt(quadruple_values,vv0); + quadruple_indices = vec_sel( quadruple_indices,ind2,r1); + quadruple_values= vec_sel(quadruple_values,vv0,r1); + + temp0+=temp_add; + } + + //now we have to chose from 4 values and 4 different indices + // we will compare pairwise if pairs are exactly the same we will choose minimum between index + // otherwise we will assign index of the minimum value + float a1,a2,a3,a4; + 
unsigned int i1,i2,i3,i4; + a1=vec_extract(quadruple_values,0); + a2=vec_extract(quadruple_values,1); + a3=vec_extract(quadruple_values,2); + a4=vec_extract(quadruple_values,3); + i1=vec_extract(quadruple_indices,0); + i2=vec_extract(quadruple_indices,1); + i3=vec_extract(quadruple_indices,2); + i4=vec_extract(quadruple_indices,3); + if(a1==a2){ + index=i1>i2?i2:i1; + }else if(a2i4?i4:i3; + }else if(a4index?index:i1; + *minf=a1; + }else if(a3 0) { + + min = ciamin_kernel_32(n1, x, &minf); + i = n1; + ix = n1 << 1; + } + + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (min + 1); + + } else { + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (min + 1); + } + +} + + diff --git a/kernel/power/isamax.c b/kernel/power/isamax.c index bf1af78d6..fb2dafec0 100644 --- a/kernel/power/isamax.c +++ b/kernel/power/isamax.c @@ -1,288 +1,288 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
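[Editor's note, not part of the patch] All four kernels in these files (icamax/icamin above, isamax/isamin below) finish with the same lane reduction their comments describe: four candidate values and indices are compared pairwise, and on an exact tie the smaller index wins so the result matches the scalar "first occurrence" rule. A C sketch of that reduction for the max case follows; reduce4_max is an illustrative name only.

    static unsigned int reduce4_max(const float v[4], const unsigned int idx[4],
                                    float *out_max)
    {
        float a1 = v[0], a2 = v[1], a3 = v[2], a4 = v[3];
        unsigned int i1 = idx[0], i2 = idx[1], i3 = idx[2], i4 = idx[3];

        /* pair (a1,i1) vs (a2,i2) */
        if (a1 == a2)        i1 = i1 < i2 ? i1 : i2;
        else if (a2 > a1)  { i1 = i2; a1 = a2; }

        /* pair (a3,i3) vs (a4,i4) */
        if (a3 == a4)        i3 = i3 < i4 ? i3 : i4;
        else if (a4 > a3)  { i3 = i4; a3 = a4; }

        /* winners of the two pairs */
        if (a1 == a3)        i1 = i1 < i3 ? i1 : i3;
        else if (a3 > a1)  { i1 = i3; a1 = a3; }

        *out_max = a1;
        return i1;
    }
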
- *****************************************************************************/ -#include "common.h" -#include -#include - - -#if defined(DOUBLE) - #define ABS fabs -#else - #define ABS fabsf -#endif - -/** - * Find maximum index - * Warning: requirements n>0 and n % 64 == 0 - * @param n - * @param x pointer to the vector - * @param maxf (out) maximum absolute value .( only for output ) - * @return index - */ -static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { - BLASLONG index; - BLASLONG i=0; - register __vector unsigned int static_index0 = {0,1,2,3}; - register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register - register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; - register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; - register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; - temp0=vec_xor(temp0,temp0); - temp1=temp1 <<1 ; //{16,16,16,16} - register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} - register __vector float quadruple_values={0,0,0,0}; - register __vector float * v_ptrx=(__vector float *)x; - for(; ii2?i2:i1; - }else if(a2>a1){ - index=i2; - a1=a2; - }else{ - index= i1; - } - - if(a4==a3){ - i1=i3>i4?i4:i3; - }else if(a4>a3){ - i1=i4; - a3=a4; - }else{ - i1= i3; - } - - if(a1==a3){ - index=i1>index?index:i1; - *maxf=a1; - }else if(a3>a1){ - index=i1; - *maxf=a3; - }else{ - *maxf=a1; - } - return index; - -} - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; - - if (n <= 0 || inc_x <= 0) return (max); - - if (inc_x == 1) { - - BLASLONG n1 = n & -64; - if (n1 > 0) { - - max = siamax_kernel_64(n1, x, &maxf); - - i = n1; - } - - while (i < n) { - if (ABS(x[i]) > maxf) { - max = i; - maxf = ABS(x[i]); - } - i++; - } - return (max + 1); - - } else { - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - max = j + 1; - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - max = j + 2; - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - max = j + 3; - maxf = ABS(x[i + 3 * inc_x]); - } - - i += inc_x * 4; - - j += 4; - - } - - - while (j < n) { - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (max + 1); - } -} +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "common.h" +#include +#include + + +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif + +/** + * Find maximum index + * Warning: requirements n>0 and n % 64 == 0 + * @param n + * @param x pointer to the vector + * @param maxf (out) maximum absolute value .( only for output ) + * @return index + */ +static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} + register __vector float quadruple_values={0,0,0,0}; + register __vector float * v_ptrx=(__vector float *)x; + for(; ii2?i2:i1; + }else if(a2>a1){ + index=i2; + a1=a2; + }else{ + index= i1; + } + + if(a4==a3){ + i1=i3>i4?i4:i3; + }else if(a4>a3){ + i1=i4; + a3=a4; + }else{ + i1= i3; + } + + if(a1==a3){ + index=i1>index?index:i1; + *maxf=a1; + }else if(a3>a1){ + index=i1; + *maxf=a3; + }else{ + *maxf=a1; + } + return index; + +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + max = siamax_kernel_64(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/power/isamin.c b/kernel/power/isamin.c index 1c1f0ad78..60c843f58 100644 --- a/kernel/power/isamin.c +++ b/kernel/power/isamin.c @@ -1,288 +1,288 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The 
OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ -#include "common.h" -#include -#include -#if defined(DOUBLE) - #define ABS fabs -#else - #define ABS fabsf -#endif -/** - * Find minimum index - * Warning: requirements n>0 and n % 64 == 0 - * @param n - * @param x pointer to the vector - * @param minf (out) minimum absolute value .( only for output ) - * @return index - */ -static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { - BLASLONG index; - BLASLONG i=0; - register __vector unsigned int static_index0 = {0,1,2,3}; - register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register - register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; - register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; - register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; - temp0=vec_xor(temp0,temp0); - temp1=temp1 <<1 ; //{16,16,16,16} - register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; - register __vector float * v_ptrx=(__vector float *)x; - register __vector float quadruple_values=vec_abs(v_ptrx[0]); - for(; ii2?i2:i1; - }else if(a2i4?i4:i3; - }else if(a4index?index:i1; - *minf=a1; - }else if(a3 0) { - - min = siamin_kernel_64(n1, x, &minf); - i = n1; - } - - while (i < n) { - if (ABS(x[i]) < minf) { - min = i; - minf = ABS(x[i]); - } - i++; - } - return (min + 1); - - } else { - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - min = j + 1; - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - min = j + 2; - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - min = j + 3; - minf = ABS(x[i + 3 * inc_x]); - } - - i += inc_x * 4; - - j += 4; - - } - - - while (j < n) { - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (min + 1); - } -} 
+/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ +#include "common.h" +#include +#include +#if defined(DOUBLE) + #define ABS fabs +#else + #define ABS fabsf +#endif +/** + * Find minimum index + * Warning: requirements n>0 and n % 64 == 0 + * @param n + * @param x pointer to the vector + * @param minf (out) minimum absolute value .( only for output ) + * @return index + */ +static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { + BLASLONG index; + BLASLONG i=0; + register __vector unsigned int static_index0 = {0,1,2,3}; + register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register + register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; + temp0=vec_xor(temp0,temp0); + temp1=temp1 <<1 ; //{16,16,16,16} + register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; + register __vector float * v_ptrx=(__vector float *)x; + register __vector float quadruple_values=vec_abs(v_ptrx[0]); + for(; ii2?i2:i1; + }else if(a2i4?i4:i3; + }else if(a4index?index:i1; + *minf=a1; + }else if(a3 0) { + + min = siamin_kernel_64(n1, x, &minf); + i = n1; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < 
minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/power/sgemm_kernel_power9.S b/kernel/power/sgemm_kernel_power9.S index 7a0f3143e..5cdc83d87 100644 --- a/kernel/power/sgemm_kernel_power9.S +++ b/kernel/power/sgemm_kernel_power9.S @@ -1,272 +1,272 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - - -#define LOAD ld -#define STACKSIZE (512 ) -#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ -#define M r3 -#define N r4 -#define K r5 - - -#define A r7 -#define B r8 -#define C r9 -#define LDC r10 -#define OFFSET r6 - - - -#define alpha_r vs20 -#define save_permute_1 vs21 -#define save_permute_2 vs22 -#define permute_mask vs23 -#define o0 0 - - -#define T1 r11 -#define T2 r12 -#define T3 r14 -#define T4 r15 -#define T5 r16 -#define T6 r17 -#define L r18 -#define T7 r19 -#define T8 r20 -#define TEMP_REG r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define T9 r27 -#define T10 r28 -#define T11 r29 - -#define T12 r30 -#define T13 r31 - -#include "sgemm_macros_power9.S" - -.equ perm_const1, 0x0405060700010203 -.equ perm_const2, 0x0c0d0e0f08090a0b -.equ save_permute_11, 0x1415161718191a1b -.equ save_permute_12, 0x0405060708090a0b -.equ save_permute_21, 0x101112131c1d1e1f -.equ save_permute_22, 0x000102030c0d0e0f - - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - addi SP, SP, -STACKSIZE - mflr r0 - - - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - std r0, FLINK_SAVE(SP) - - -#if defined(TRMMKERNEL) - ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) -#endif - slwi LDC, LDC, 2 - - - - /*alpha is stored in f1. 
convert to single and splat*/ - xscvdpspn alpha_r,vs1 - xxspltw alpha_r,alpha_r,0 - -/*load reverse permute mask for big endian - uint128 = 0xc0d0e0f08090a0b0405060700010203 -*/ - - lis T2, perm_const2@highest - lis T1, perm_const1@highest - lis T3, save_permute_12@highest - lis T4, save_permute_11@highest - lis T5, save_permute_22@highest - lis T6, save_permute_21@highest - ori T2, T2, perm_const2@higher - ori T1, T1, perm_const1@higher - ori T3, T3, save_permute_12@higher - ori T4, T4, save_permute_11@higher - ori T5, T5, save_permute_22@higher - ori T6, T6, save_permute_21@higher - rldicr T2, T2, 32, 31 - rldicr T1, T1, 32, 31 - rldicr T3, T3, 32, 31 - rldicr T4, T4, 32, 31 - rldicr T5, T5, 32, 31 - rldicr T6, T6, 32, 31 - oris T2, T2, perm_const2@h - oris T1, T1, perm_const1@h - oris T3, T3, save_permute_12@h - oris T4, T4, save_permute_11@h - oris T5, T5, save_permute_22@h - oris T6, T6, save_permute_21@h - ori T2, T2, perm_const2@l - ori T1, T1, perm_const1@l - ori T3, T3, save_permute_12@l - ori T4, T4, save_permute_11@l - ori T5, T5, save_permute_22@l - ori T6, T6, save_permute_21@l - li r0,0 - mtvsrdd permute_mask,T2,T1 - mtvsrdd save_permute_1,T3,T4 - mtvsrdd save_permute_2,T5,T6 - -#include "sgemm_logic_power9.S" - -.L999: - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - ld r0, FLINK_SAVE(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - mtlr r0 - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - - EPILOGUE -#endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ +#define M r3 +#define N r4 +#define K r5 + + +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 + + + +#define alpha_r vs20 +#define save_permute_1 vs21 +#define save_permute_2 vs22 +#define permute_mask vs23 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define T11 r29 + +#define T12 r30 +#define T13 r31 + +#include "sgemm_macros_power9.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_11, 0x1415161718191a1b +.equ save_permute_12, 0x0405060708090a0b +.equ save_permute_21, 0x101112131c1d1e1f +.equ save_permute_22, 0x000102030c0d0e0f + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + mflr r0 + + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) + + +#if defined(TRMMKERNEL) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + slwi LDC, LDC, 2 + + + + /*alpha is stored in f1. 
convert to single and splat*/ + xscvdpspn alpha_r,vs1 + xxspltw alpha_r,alpha_r,0 + +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + lis T2, perm_const2@highest + lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + lis T5, save_permute_22@highest + lis T6, save_permute_21@highest + ori T2, T2, perm_const2@higher + ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + ori T5, T5, save_permute_22@higher + ori T6, T6, save_permute_21@higher + rldicr T2, T2, 32, 31 + rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + rldicr T5, T5, 32, 31 + rldicr T6, T6, 32, 31 + oris T2, T2, perm_const2@h + oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + oris T5, T5, save_permute_22@h + oris T6, T6, save_permute_21@h + ori T2, T2, perm_const2@l + ori T1, T1, perm_const1@l + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + ori T5, T5, save_permute_22@l + ori T6, T6, save_permute_21@l + li r0,0 + mtvsrdd permute_mask,T2,T1 + mtvsrdd save_permute_1,T3,T4 + mtvsrdd save_permute_2,T5,T6 + +#include "sgemm_logic_power9.S" + +.L999: + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + + EPILOGUE +#endif diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S index a34ed32b8..4022959e2 100644 --- a/kernel/power/sgemm_logic_power9.S +++ b/kernel/power/sgemm_logic_power9.S @@ -1,2192 +1,2192 @@ -#define MY_ALIGN .align 3 -b L8 - - MY_ALIGN -LSGEMM_L8x16_LMAIN_SUB: - LOAD8x16_2 - MY_ALIGN - -LSGEMM_L8x16_LOOP: - KERNEL8x16_L2 128,64,0,0 -LSGEMM_L8x16_K128: - KERNEL8x16_L2 128,64,1,0 - KERNEL8x16_I1_L4_2 128,64, 1,0 - KERNEL8x16_I1_L4_2 128,64, 2,0 - KERNEL8x16_I1_L4_2 128,64, 3,0 - KERNEL8x16_I1_L4_2 128,64, 4,0 - KERNEL8x16_I1_L4_2 128,64, 5,0 - KERNEL8x16_I1_L4_2 128,64, 6,0 - KERNEL8x16_I1_L4_2 128,64, 7,0 - KERNEL8x16_I1_L4_2 128,64, 8,0 - KERNEL8x16_I1_L4_2 128,64, 9,0 - KERNEL8x16_I1_L4_2 128,64, 10,0 - KERNEL8x16_I1_L4_2 128,64, 11,0 - KERNEL8x16_I1_L4_2 128,64, 12,0 - KERNEL8x16_I1_L4_2 128,64, 13,0 - KERNEL8x16_I1_L4_2 128,64, 14,0 - KERNEL8x16_I1_L4_2 128,64, 15,0 - KERNEL8x16_I1_L4_2 128,64, 16,0 - KERNEL8x16_I1_L4_2 128,64, 17,0 - KERNEL8x16_I1_L4_2 128,64, 18,0 - KERNEL8x16_I1_L4_2 128,64, 19,0 - KERNEL8x16_I1_L4_2 128,64, 20,0 - KERNEL8x16_I1_L4_2 128,64, 21,0 - KERNEL8x16_I1_L4_2 128,64, 22,0 - KERNEL8x16_I1_L4_2 128,64, 23,0 - KERNEL8x16_I1_L4_2 128,64, 24,0 - KERNEL8x16_I1_L4_2 128,64, 25,0 - KERNEL8x16_I1_L4_2 128,64, 26,0 - KERNEL8x16_I1_L4_2 
128,64, 27,0 - KERNEL8x16_I1_L4_2 128,64, 28,0 - KERNEL8x16_I1_L4_2 128,64, 29,0 - KERNEL8x16_I1_L4_2 128,64, 30,0 - KERNEL8x16_I1_L4_2 128,64, 31,1 - bdnz LSGEMM_L8x16_LOOP - - MY_ALIGN -LSGEMM_L8x16_LOOP_END: - END8x16_2 - blr - - MY_ALIGN -LSGEMM_L8x16_L64_SUB: - LOAD8x16_2 - KERNEL8x16_I1_L4_2 128,64, 0,0 - KERNEL8x16_I1_L4_2 128,64, 1,0 - KERNEL8x16_I1_L4_2 128,64, 2,0 - KERNEL8x16_I1_L4_2 128,64,3,0 - KERNEL8x16_I1_L4_2 128,64,4,0 - KERNEL8x16_I1_L4_2 128,64,5,0 - KERNEL8x16_I1_L4_2 128,64,6,0 - KERNEL8x16_I1_L4_2 128,64,7,0 - KERNEL8x16_I1_L4_2 128,64,8,0 - KERNEL8x16_I1_L4_2 128,64,9,0 - KERNEL8x16_I1_L4_2 128,64,10,0 - KERNEL8x16_I1_L4_2 128,64,11,0 - KERNEL8x16_I1_L4_2 128,64,12,0 - KERNEL8x16_I1_L4_2 128,64,13,0 - KERNEL8x16_I1_L4_2 128,64,14,0 - KERNEL8x16_I1_L4_3 128,64,15,1 - blr -LSGEMM_L8x16_L32_SUB: - LOAD8x16_2 - KERNEL8x16_I1_L4_2 128,64,0,0 - KERNEL8x16_I1_L4_2 128,64,1,0 - KERNEL8x16_I1_L4_2 128,64,2,0 - KERNEL8x16_I1_L4_2 128,64,3,0 - KERNEL8x16_I1_L4_2 128,64,4,0 - KERNEL8x16_I1_L4_2 128,64,5,0 - KERNEL8x16_I1_L4_2 128,64,6,0 - KERNEL8x16_I1_L4_3 128,64,7,1 - blr - -LSGEMM_L8x16_L16_SUB: - LOAD8x16_2 - KERNEL8x16_I1_L4_2 128,64,0,0 - KERNEL8x16_I1_L4_2 128,64,1,0 - KERNEL8x16_I1_L4_2 128,64,2,0 - KERNEL8x16_I1_L4_3 128,64,3,1 - blr - -L8: -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - - srawi. J, N, 3 - - ble LSGEMM_L8_END - -LSGEMM_L8_BEGIN: - - li T1, 128 - li T2, 256 - - mr AO, A - mr CO, C - slwi T3, LDC , 3 - add C, C, T3 - - dcbt A, T1 - dcbt A, T2 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LSGEMM_L8x16_END - - MY_ALIGN -LSGEMM_L8x16_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 - mr T12, T11 - addi T12,T12, -2 - srawi. L, T12, 7 /**(T11-2) % 128x */ -#else - mr T12, K - addi T12,T12, -2 - srawi. L, T12, 7 /**(K-2) % 128x */ -#endif - - ZERO8x16 - ble LSGEMM_L8x16_SUB0 - mtctr L - bl LSGEMM_L8x16_LMAIN_SUB - andi. L, T12, 127 - ble LSGEMM_L8x16_SAVE - b LSGEMM_L8x16_SUB2 - MY_ALIGN -LSGEMM_L8x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 255 - cmpwi T11,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T10,1 - bne CMP8x16_128K - addi BO,BO,-32 - addi AO,AO,-64 - LOAD8x16 64,32 - END8x16_WITHOUT_ADD - LOAD8x16_2O AO,BO, 128, 64 - mtctr T10 - bl LSGEMM_L8x16_K128 - b LSGEMM_L8x16_SAVE -CMP8x16_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T11,128 -#else - cmpwi K,128 -#endif - bne LSGEMM_L8x16_SUB2 - MY_ALIGN - mtctr T10 - addi BO,BO,-64 - addi AO,AO,-128 - LOAD8x16_2O AO,BO, 128,64 - bl LSGEMM_L8x16_K128 - b LSGEMM_L8x16_SAVE - MY_ALIGN -LSGEMM_L8x16_SUB2: - andi. T10,L,64 - ble LSGEMM_L8x16_SUB2_32 - bl LSGEMM_L8x16_L64_SUB - MY_ALIGN -LSGEMM_L8x16_SUB2_32: - andi. T10,L, 32 - ble LSGEMM_L8x16_SUB2_16 - bl LSGEMM_L8x16_L32_SUB - MY_ALIGN -LSGEMM_L8x16_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L8x16_SUB2_8 - bl LSGEMM_L8x16_L16_SUB - MY_ALIGN -LSGEMM_L8x16_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L8x16_SUB2_4 - LOAD8x16_2 - KERNEL8x16_I1_L4_2 128,64, 0,0 - KERNEL8x16_I1_L4_3 128,64, 1,1 - MY_ALIGN -LSGEMM_L8x16_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L8x16_SUB2_2 - LOAD8x16_2 - KERNEL8x16_I1_L4_3 128,64, 0,1 - MY_ALIGN -LSGEMM_L8x16_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L8x16_SUB2_1 - LOAD8x16_2 - KERNEL8x16_E2 128,64, 0,1 - MY_ALIGN -LSGEMM_L8x16_SUB2_1: - andi. 
T10,L, 1 - ble LSGEMM_L8x16_SAVE - KERNEL8x16 0 - - - MY_ALIGN -LSGEMM_L8x16_SAVE: - SAVE8x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,8 -#endif - addic. I, I, -1 - bgt+ LSGEMM_L8x16_BEGIN - MY_ALIGN -LSGEMM_L8x16_END: -LSGEMM_L8x8_BEGIN: - andi. T2, M, 15 - ble LSGEMM_L8x1_END - - andi. T1, M, 8 - ble LSGEMM_L8x8_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,8,8 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 4 /**(T11-1) % 16x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. L, T12, 4 /**(K-1) % 16x */ -#endif - - ZERO8x8 - ble LSGEMM_L8x8_SUB0 - - MY_ALIGN -LSGEMM_L8x8_LOOP_START: - - LOAD8x8_0 /*we already zeroed */ - mtctr L - - MY_ALIGN - -LSGEMM_L8x8_LOOP: - - KERNEL8x8_I1_L4_2 32,32, 0,0 - KERNEL8x8_I1_L4_2 32,32, 1,0 - KERNEL8x8_I1_L4_2 32,32, 2,0 - KERNEL8x8_I1_L4_2 32,32, 3,1 - - bdnz LSGEMM_L8x8_LOOP - - MY_ALIGN -LSGEMM_L8x8_LOOP_END: - - END8x8 0, AO, BO, 32, 32 - - b LSGEMM_L8x8_SUB1 - MY_ALIGN -LSGEMM_L8x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 31 -#else - andi. L, K, 31 -#endif - b LSGEMM_L8x8_SUB2 - MY_ALIGN -LSGEMM_L8x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 15 -#else - andi. L, T12, 15 -#endif - ble LSGEMM_L8x8_SAVE - MY_ALIGN -LSGEMM_L8x8_SUB2: - - srawi. T1,L, 3 - ble LSGEMM_L8x8_SUB2_4 - mtctr T1 - MY_ALIGN -LSGEMM_L8x8_SUB2_LOOP: - LOAD8x8_0 - KERNEL8x8_I1_L4_2 32,32, 0,0 - KERNEL8x8_I1_L4_3 32,32, 1,1 - bdnz LSGEMM_L8x8_SUB2_LOOP - MY_ALIGN -LSGEMM_L8x8_SUB2_4: - andi. T1,L, 4 - ble LSGEMM_L8x8_SUB2_2 - LOAD8x8_0 - KERNEL8x8_I1_L4_3 32,32, 0,1 - MY_ALIGN -LSGEMM_L8x8_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L8x8_SUB2_1 - LOAD8x8_0 - KERNEL8x8_I1_L2_3 32,32, 0,1 - MY_ALIGN -LSGEMM_L8x8_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L8x8_SAVE - KERNEL8x8 0 - - - MY_ALIGN -LSGEMM_L8x8_SAVE: - SAVE8x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,8 -#endif - MY_ALIGN -LSGEMM_L8x8_END: -LSGEMM_L8x4_BEGIN: - andi. T2, M, 15 - ble LSGEMM_L8x1_END - - andi. T1, M, 4 - ble LSGEMM_L8x4_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,4,8 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 4 /**(T11-1) % 16x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. L, T12, 4 /**(K-1) % 16x */ -#endif - - ZERO8x4 - ble LSGEMM_L8x4_SUB0 - - MY_ALIGN -LSGEMM_L8x4_LOOP_START: - - LOAD8x4_0 /*we already zeroed */ - mtctr L - - MY_ALIGN - -LSGEMM_L8x4_LOOP: - - KERNEL8x4_I1_L4_2 16,32, 0,0 - KERNEL8x4_I1_L4_2 16,32, 1,0 - KERNEL8x4_I1_L4_2 16,32, 2,0 - KERNEL8x4_I1_L4_2 16,32, 3,1 - - bdnz LSGEMM_L8x4_LOOP - - MY_ALIGN -LSGEMM_L8x4_LOOP_END: - - END8x4 0, AO, BO, 16, 32 - - b LSGEMM_L8x4_SUB1 - MY_ALIGN -LSGEMM_L8x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 31 -#else - andi. L, K, 31 -#endif - b LSGEMM_L8x4_SUB2 - MY_ALIGN -LSGEMM_L8x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 15 -#else - andi. L, T12, 15 -#endif - ble LSGEMM_L8x4_SAVE - MY_ALIGN -LSGEMM_L8x4_SUB2: - - srawi. T1,L, 3 - ble LSGEMM_L8x4_SUB2_4 - mtctr T1 - MY_ALIGN -LSGEMM_L8x4_SUB2_LOOP: - LOAD8x4_0 - KERNEL8x4_I1_L4_2 16,32, 0,0 - KERNEL8x4_I1_L4_3 16,32, 1,1 - bdnz LSGEMM_L8x4_SUB2_LOOP - MY_ALIGN -LSGEMM_L8x4_SUB2_4: - andi. T1,L, 4 - ble LSGEMM_L8x4_SUB2_2 - LOAD8x4_0 - KERNEL8x4_I1_L4_3 16,32, 0,1 - MY_ALIGN -LSGEMM_L8x4_SUB2_2: - andi. 
T1,L, 2 - ble LSGEMM_L8x4_SUB2_1 - LOAD8x4_0 - KERNEL8x4_I1_L2_3 16,32, 0,1 - MY_ALIGN -LSGEMM_L8x4_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L8x4_SAVE - KERNEL8x4 0 - - - MY_ALIGN -LSGEMM_L8x4_SAVE: - SAVE8x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,8 -#endif - MY_ALIGN -LSGEMM_L8x4_END: -LSGEMM_L8x2_BEGIN: - andi. T1, M, 2 - ble LSGEMM_L8x2_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,2,8 - srawi. L, T11, 3 /**(T11) % 8x */ -#else - srawi. L, K, 3 /**(K) % 8x */ -#endif - - ZERO8x2 - ble LSGEMM_L8x2_SUB0 - - MY_ALIGN -LSGEMM_L8x2_LOOP_START: - mtctr L - - MY_ALIGN - -LSGEMM_L8x2_LOOP: - - KERNEL8x2_2 0,0, 0,0 - KERNEL8x2_2 0,0, 1,0 - KERNEL8x2_2 0,0, 2,0 - KERNEL8x2_2 0,0, 3,1 - - bdnz LSGEMM_L8x2_LOOP - - MY_ALIGN -LSGEMM_L8x2_LOOP_END: - -LSGEMM_L8x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 7 -#else - andi. L, K, 7 -#endif - ble LSGEMM_L8x2_SAVE - MY_ALIGN -LSGEMM_L8x2_SUB2: - andi. T1,L, 4 - ble LSGEMM_L8x2_SUB2_2 - KERNEL8x2_2 0,0, 0,0 - KERNEL8x2_2 0,0, 1,1 - MY_ALIGN -LSGEMM_L8x2_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L8x2_SUB2_1 - KERNEL8x2_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L8x2_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L8x2_SAVE - KERNEL8x2 - - MY_ALIGN -LSGEMM_L8x2_SAVE: - SAVE8x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,8 -#endif - MY_ALIGN -LSGEMM_L8x2_END: -LSGEMM_L8x1_BEGIN: - andi. T1, M, 1 - ble LSGEMM_L8x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,8 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,1,8 - srawi. L, T11, 3 /**(T11) % 8x */ -#else - srawi. L, K, 3 /**(K) % 8x */ -#endif - - ZERO8x1 - ble LSGEMM_L8x1_SUB0 - - MY_ALIGN -LSGEMM_L8x1_LOOP_START: - mtctr L - - MY_ALIGN - -LSGEMM_L8x1_LOOP: - - KERNEL8x1_4 0,0, 0,0 - KERNEL8x1_4 0,0, 1,1 - - bdnz LSGEMM_L8x1_LOOP - - MY_ALIGN -LSGEMM_L8x1_LOOP_END: - -LSGEMM_L8x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 7 -#else - andi. L, K, 7 -#endif - ble LSGEMM_L8x1_SAVE - MY_ALIGN -LSGEMM_L8x1_SUB2: - andi. T1,L, 4 - ble LSGEMM_L8x1_SUB2_2 - KERNEL8x1_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L8x1_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L8x1_SUB2_1 - KERNEL8x1_2 - MY_ALIGN -LSGEMM_L8x1_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L8x1_SAVE - KERNEL8x1 - - MY_ALIGN -LSGEMM_L8x1_SAVE: - SAVE8x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,8 -#endif - MY_ALIGN -LSGEMM_L8x1_END: - - slwi T1, K, 5 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 8 -#endif - addic. J, J, -1 - bgt LSGEMM_L8_BEGIN - - -LSGEMM_L8_END: - -/* b LSGEMM_L4_BEGIN*/ - andi. T1, N, 4 - ble LSGEMM_L4_END -LSGEMM_L4_BEGIN: - - - mr AO, A - mr CO, C - slwi T3, LDC , 2 - add C, C, T3 - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LSGEMM_L4x16_END - - MY_ALIGN -LSGEMM_L4x16_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,16,4 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 6 /**(T11-1) % 64x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. 
L, T12, 6 /**(K-1) % 64x */ -#endif - - ZERO4x16 - ble LSGEMM_L4x16_SUB0 - - MY_ALIGN -LSGEMM_L4x16_LOOP_START: - - LOAD4x16_0 /*we already zeroed */ - ##OffsetA=64 OffsetB=16 - addi AO,AO,2112 - addi BO,BO,16 - - mtctr L - - MY_ALIGN - -LSGEMM_L4x16_LOOP: - - KERNEL4x16_I1_L4_2 -2048,0, 0,0 - KERNEL4x16_I1_L4_2 -2048,0, 1,0 - KERNEL4x16_I1_L4_2 -2048,0, 2,0 - KERNEL4x16_I1_L4_2 -2048,0, 3,0 - KERNEL4x16_I1_L4_2 -2048,0, 4,0 - KERNEL4x16_I1_L4_2 -2048,0, 5,0 - KERNEL4x16_I1_L4_2 -2048,0, 6,0 - KERNEL4x16_I1_L4_2 -2048,0, 7,0 - KERNEL4x16_I1_L4_2 -2048,0, 8,0 - KERNEL4x16_I1_L4_2 -2048,0, 9,0 - KERNEL4x16_I1_L4_2 -2048,0, 10,0 - KERNEL4x16_I1_L4_2 -2048,0, 11,0 - KERNEL4x16_I1_L4_2 -2048,0, 12,0 - KERNEL4x16_I1_L4_2 -2048,0, 13,0 - KERNEL4x16_I1_L4_2 -2048,0, 14,0 - KERNEL4x16_I1_L4_2 -2048,0, 15,1 - - bdnz LSGEMM_L4x16_LOOP - - MY_ALIGN -LSGEMM_L4x16_LOOP_END: - - END4x16 0, AO, BO, -2048, 0 - - b LSGEMM_L4x16_SUB1 - MY_ALIGN -LSGEMM_L4x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 127 -#else - andi. L, K, 127 -#endif - b LSGEMM_L4x16_SUB2 - MY_ALIGN -LSGEMM_L4x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 63 -#else - andi. L, T12, 63 -#endif - ble LSGEMM_L4x16_SAVE - MY_ALIGN -LSGEMM_L4x16_SUB2: - - srawi. T10,L, 5 - ble LSGEMM_L4x16_SUB2_16 - mtctr T10 - MY_ALIGN -LSGEMM_L4x16_SUB2_LOOP: - LOAD4x16_0 - KERNEL4x16_I1_L4_2 64,16, 0,0 - KERNEL4x16_I1_L4_2 64,16, 1,0 - KERNEL4x16_I1_L4_2 64,16, 2,0 - KERNEL4x16_I1_L4_2 64,16, 3,0 - KERNEL4x16_I1_L4_2 64,16, 4,0 - KERNEL4x16_I1_L4_2 64,16, 5,0 - KERNEL4x16_I1_L4_2 64,16, 6,0 - KERNEL4x16_I1_L4_3 64,16, 7,1 - bdnz LSGEMM_L4x16_SUB2_LOOP - MY_ALIGN -LSGEMM_L4x16_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L4x16_SUB2_8 - LOAD4x16_0 - KERNEL4x16_I1_L4_2 64,16, 0,0 - KERNEL4x16_I1_L4_2 64,16, 1,0 - KERNEL4x16_I1_L4_2 64,16, 2,0 - KERNEL4x16_I1_L4_3 64,16, 3,1 - MY_ALIGN -LSGEMM_L4x16_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L4x16_SUB2_4 - LOAD4x16_0 - KERNEL4x16_I1_L4_2 64,16, 0,0 - KERNEL4x16_I1_L4_3 64,16, 1,1 - MY_ALIGN -LSGEMM_L4x16_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L4x16_SUB2_2 - LOAD4x16_0 - KERNEL4x16_I1_L4_3 64,16, 0,1 - MY_ALIGN -LSGEMM_L4x16_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L4x16_SUB2_1 - LOAD4x16_0 - KERNEL4x16_I1_L2_3 64,16, 0,1 - MY_ALIGN -LSGEMM_L4x16_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L4x16_SAVE - KERNEL4x16 0 -# addic. L, L, -1 -# bgt LSGEMM_L4x16_SUB2 - - MY_ALIGN -LSGEMM_L4x16_SAVE: - SAVE4x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,4 -#endif - addic. I, I, -1 - bgt+ LSGEMM_L4x16_BEGIN - MY_ALIGN -LSGEMM_L4x16_END: -LSGEMM_L4x8_BEGIN: - andi. T2, M, 15 - ble LSGEMM_L4x1_END - - andi. T1, M, 8 - ble LSGEMM_L4x8_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,8,4 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 4 /**(T11-1) % 16x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. L, T12, 4 /**(K-1) % 16x */ -#endif - - ZERO4x8 - ble LSGEMM_L4x8_SUB0 - - MY_ALIGN -LSGEMM_L4x8_LOOP_START: - - LOAD4x8_0 /*we already zeroed */ - mtctr L - - MY_ALIGN - -LSGEMM_L4x8_LOOP: - - KERNEL4x8_I1_L4_2 32,16, 0,0 - KERNEL4x8_I1_L4_2 32,16, 1,0 - KERNEL4x8_I1_L4_2 32,16, 2,0 - KERNEL4x8_I1_L4_2 32,16, 3,1 - - bdnz LSGEMM_L4x8_LOOP - - MY_ALIGN -LSGEMM_L4x8_LOOP_END: - - END4x8 0, AO, BO, 32, 16 - - b LSGEMM_L4x8_SUB1 - MY_ALIGN -LSGEMM_L4x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 31 -#else - andi. 
L, K, 31 -#endif - b LSGEMM_L4x8_SUB2 - MY_ALIGN -LSGEMM_L4x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 15 -#else - andi. L, T12, 15 -#endif - ble LSGEMM_L4x8_SAVE - MY_ALIGN -LSGEMM_L4x8_SUB2: - - srawi. T1,L, 3 - ble LSGEMM_L4x8_SUB2_4 - mtctr T1 - MY_ALIGN -LSGEMM_L4x8_SUB2_LOOP: - LOAD4x8_0 - KERNEL4x8_I1_L4_2 32,16, 0,0 - KERNEL4x8_I1_L4_3 32,16, 1,1 - bdnz LSGEMM_L4x8_SUB2_LOOP - MY_ALIGN -LSGEMM_L4x8_SUB2_4: - andi. T1,L, 4 - ble LSGEMM_L4x8_SUB2_2 - LOAD4x8_0 - KERNEL4x8_I1_L4_3 32,16, 0,1 - MY_ALIGN -LSGEMM_L4x8_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L4x8_SUB2_1 - LOAD4x8_0 - KERNEL4x8_I1_L2_3 32,16, 0,1 - MY_ALIGN -LSGEMM_L4x8_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L4x8_SAVE - KERNEL4x8 0 - - - MY_ALIGN -LSGEMM_L4x8_SAVE: - SAVE4x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,4 -#endif - MY_ALIGN -LSGEMM_L4x8_END: -LSGEMM_L4x4_BEGIN: - andi. T2, M, 15 - ble LSGEMM_L4x1_END - - andi. T1, M, 4 - ble LSGEMM_L4x4_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,4,4 - mr T12, T11 - addi T12,T12, -1 - srawi. L, T12, 4 /**(T11-1) % 16x */ -#else - mr T12, K - addi T12,T12, -1 - srawi. L, T12, 4 /**(K-1) % 16x */ -#endif - - ZERO4x4 - ble LSGEMM_L4x4_SUB0 - - MY_ALIGN -LSGEMM_L4x4_LOOP_START: - - LOAD4x4_0 /*we already zeroed */ - mtctr L - - MY_ALIGN - -LSGEMM_L4x4_LOOP: - - KERNEL4x4_I1_L4_2 16,16, 0,0 - KERNEL4x4_I1_L4_2 16,16, 1,0 - KERNEL4x4_I1_L4_2 16,16, 2,0 - KERNEL4x4_I1_L4_2 16,16, 3,1 - - bdnz LSGEMM_L4x4_LOOP - - MY_ALIGN -LSGEMM_L4x4_LOOP_END: - - END4x4 0, AO, BO, 16, 16 - - b LSGEMM_L4x4_SUB1 - MY_ALIGN -LSGEMM_L4x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 31 -#else - andi. L, K, 31 -#endif - b LSGEMM_L4x4_SUB2 - MY_ALIGN -LSGEMM_L4x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T12, 15 -#else - andi. L, T12, 15 -#endif - ble LSGEMM_L4x4_SAVE - MY_ALIGN -LSGEMM_L4x4_SUB2: - - srawi. T1,L, 3 - ble LSGEMM_L4x4_SUB2_4 - mtctr T1 - MY_ALIGN -LSGEMM_L4x4_SUB2_LOOP: - LOAD4x4_0 - KERNEL4x4_I1_L4_2 16,16, 0,0 - KERNEL4x4_I1_L4_3 16,16, 1,1 - bdnz LSGEMM_L4x4_SUB2_LOOP - MY_ALIGN -LSGEMM_L4x4_SUB2_4: - andi. T1,L, 4 - ble LSGEMM_L4x4_SUB2_2 - LOAD4x4_0 - KERNEL4x4_I1_L4_3 16,16, 0,1 - MY_ALIGN -LSGEMM_L4x4_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L4x4_SUB2_1 - LOAD4x4_0 - KERNEL4x4_I1_L2_3 16,16, 0,1 - MY_ALIGN -LSGEMM_L4x4_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L4x4_SAVE - KERNEL4x4 0 - - - MY_ALIGN -LSGEMM_L4x4_SAVE: - SAVE4x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,4 -#endif - MY_ALIGN -LSGEMM_L4x4_END: -LSGEMM_L4x2_BEGIN: - andi. T1, M, 2 - ble LSGEMM_L4x2_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,2,4 - srawi. L, T11, 3 /**(T11) % 8x */ -#else - srawi. L, K, 3 /**(K) % 8x */ -#endif - - ZERO4x2 - ble LSGEMM_L4x2_SUB0 - - MY_ALIGN -LSGEMM_L4x2_LOOP_START: - mtctr L - - MY_ALIGN - -LSGEMM_L4x2_LOOP: - - KERNEL4x2_2 0,0, 0,0 - KERNEL4x2_2 0,0, 1,0 - KERNEL4x2_2 0,0, 2,0 - KERNEL4x2_2 0,0, 3,1 - - bdnz LSGEMM_L4x2_LOOP - - MY_ALIGN -LSGEMM_L4x2_LOOP_END: - -LSGEMM_L4x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 7 -#else - andi. L, K, 7 -#endif - ble LSGEMM_L4x2_SAVE - MY_ALIGN -LSGEMM_L4x2_SUB2: - andi. T1,L, 4 - ble LSGEMM_L4x2_SUB2_2 - KERNEL4x2_2 0,0, 0,0 - KERNEL4x2_2 0,0, 1,1 - MY_ALIGN -LSGEMM_L4x2_SUB2_2: - andi. 
T1,L, 2 - ble LSGEMM_L4x2_SUB2_1 - KERNEL4x2_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L4x2_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L4x2_SAVE - KERNEL4x2 - - MY_ALIGN -LSGEMM_L4x2_SAVE: - SAVE4x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,4 -#endif - MY_ALIGN -LSGEMM_L4x2_END: -LSGEMM_L4x1_BEGIN: - andi. T1, M, 1 - ble LSGEMM_L4x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,1,4 - srawi. L, T11, 3 /**(T11) % 8x */ -#else - srawi. L, K, 3 /**(K) % 8x */ -#endif - - ZERO4x1 - ble LSGEMM_L4x1_SUB0 - - MY_ALIGN -LSGEMM_L4x1_LOOP_START: - mtctr L - - MY_ALIGN - -LSGEMM_L4x1_LOOP: - - KERNEL4x1_4 0,0, 0,0 - KERNEL4x1_4 0,0, 1,1 - - bdnz LSGEMM_L4x1_LOOP - - MY_ALIGN -LSGEMM_L4x1_LOOP_END: - -LSGEMM_L4x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 7 -#else - andi. L, K, 7 -#endif - ble LSGEMM_L4x1_SAVE - MY_ALIGN -LSGEMM_L4x1_SUB2: - andi. T1,L, 4 - ble LSGEMM_L4x1_SUB2_2 - KERNEL4x1_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L4x1_SUB2_2: - andi. T1,L, 2 - ble LSGEMM_L4x1_SUB2_1 - KERNEL4x1_2 - MY_ALIGN -LSGEMM_L4x1_SUB2_1: - andi. T1,L, 1 - ble LSGEMM_L4x1_SAVE - KERNEL4x1 - - MY_ALIGN -LSGEMM_L4x1_SAVE: - SAVE4x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,4 -#endif - MY_ALIGN -LSGEMM_L4x1_END: - - slwi T1, K, 4 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 4 -#endif - - andi. T2, N, 3 - ble .L999 - -LSGEMM_L4_END: - andi. T1, N, 2 - ble LSGEMM_L2_END -LSGEMM_L2_BEGIN: - - - mr AO, A - mr CO, C - slwi T3, LDC , 1 - add C, C, T3 - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LSGEMM_L2x16_END - - MY_ALIGN -LSGEMM_L2x16_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,16,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x16 - ble LSGEMM_L2x16_SUB0 - addi AO,AO,2048 - - mtctr L - - MY_ALIGN - -LSGEMM_L2x16_LOOP: - - KERNEL2x16_4 -2048,0, 0,0 - KERNEL2x16_4 -2048,0, 1,0 - KERNEL2x16_4 -2048,0, 2,0 - KERNEL2x16_4 -2048,0, 3,0 - KERNEL2x16_4 -2048,0, 4,0 - KERNEL2x16_4 -2048,0, 5,0 - KERNEL2x16_4 -2048,0, 6,0 - KERNEL2x16_4 -2048,0, 7,0 - KERNEL2x16_4 -2048,0, 8,0 - KERNEL2x16_4 -2048,0, 9,0 - KERNEL2x16_4 -2048,0, 10,0 - KERNEL2x16_4 -2048,0, 11,0 - KERNEL2x16_4 -2048,0, 12,0 - KERNEL2x16_4 -2048,0, 13,0 - KERNEL2x16_4 -2048,0, 14,0 - KERNEL2x16_4 -2048,0, 15,1 - - bdnz LSGEMM_L2x16_LOOP - MY_ALIGN - addi AO,AO, -2048 - MY_ALIGN -LSGEMM_L2x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x16_SAVE - MY_ALIGN -LSGEMM_L2x16_SUB2: - andi. T10,L, 32 - ble LSGEMM_L2x16_SUB2_16 - KERNEL2x16_4 0,0, 0,0 - KERNEL2x16_4 0,0, 1,0 - KERNEL2x16_4 0,0, 2,0 - KERNEL2x16_4 0,0, 3,0 - KERNEL2x16_4 0,0, 4,0 - KERNEL2x16_4 0,0, 5,0 - KERNEL2x16_4 0,0, 6,0 - KERNEL2x16_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x16_SUB2_8 - KERNEL2x16_4 0,0, 0,0 - KERNEL2x16_4 0,0, 1,0 - KERNEL2x16_4 0,0, 2,0 - KERNEL2x16_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x16_SUB2_4 - KERNEL2x16_4 0,0, 0,0 - KERNEL2x16_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x16_SUB2_2 - KERNEL2x16_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_2: - andi. 
T10,L, 2 - ble LSGEMM_L2x16_SUB2_1 - KERNEL2x16_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x16_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x16_SAVE - KERNEL2x16 - - MY_ALIGN -LSGEMM_L2x16_SAVE: - SAVE2x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,2 -#endif - addic. I, I, -1 - bgt+ LSGEMM_L2x16_BEGIN - MY_ALIGN -LSGEMM_L2x16_END: - andi. I, M, 8 - ble LSGEMM_L2x8_END - - MY_ALIGN -LSGEMM_L2x8_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,8,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x8 - ble LSGEMM_L2x8_SUB0 - addi AO,AO,2048 - - mtctr L - - MY_ALIGN - -LSGEMM_L2x8_LOOP: - - KERNEL2x8_4 -2048,0, 0,0 - KERNEL2x8_4 -2048,0, 1,0 - KERNEL2x8_4 -2048,0, 2,0 - KERNEL2x8_4 -2048,0, 3,0 - KERNEL2x8_4 -2048,0, 4,0 - KERNEL2x8_4 -2048,0, 5,0 - KERNEL2x8_4 -2048,0, 6,0 - KERNEL2x8_4 -2048,0, 7,0 - KERNEL2x8_4 -2048,0, 8,0 - KERNEL2x8_4 -2048,0, 9,0 - KERNEL2x8_4 -2048,0, 10,0 - KERNEL2x8_4 -2048,0, 11,0 - KERNEL2x8_4 -2048,0, 12,0 - KERNEL2x8_4 -2048,0, 13,0 - KERNEL2x8_4 -2048,0, 14,0 - KERNEL2x8_4 -2048,0, 15,1 - - bdnz LSGEMM_L2x8_LOOP - MY_ALIGN - addi AO,AO, -2048 - MY_ALIGN -LSGEMM_L2x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x8_SAVE - MY_ALIGN -LSGEMM_L2x8_SUB2: - andi. T10,L, 32 - ble LSGEMM_L2x8_SUB2_16 - KERNEL2x8_4 0,0, 0,0 - KERNEL2x8_4 0,0, 1,0 - KERNEL2x8_4 0,0, 2,0 - KERNEL2x8_4 0,0, 3,0 - KERNEL2x8_4 0,0, 4,0 - KERNEL2x8_4 0,0, 5,0 - KERNEL2x8_4 0,0, 6,0 - KERNEL2x8_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x8_SUB2_8 - KERNEL2x8_4 0,0, 0,0 - KERNEL2x8_4 0,0, 1,0 - KERNEL2x8_4 0,0, 2,0 - KERNEL2x8_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x8_SUB2_4 - KERNEL2x8_4 0,0, 0,0 - KERNEL2x8_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x8_SUB2_2 - KERNEL2x8_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x8_SUB2_1 - KERNEL2x8_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x8_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x8_SAVE - KERNEL2x8 - - MY_ALIGN -LSGEMM_L2x8_SAVE: - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,2 -#endif - MY_ALIGN -LSGEMM_L2x8_END: - andi. I, M, 4 - ble LSGEMM_L2x4_END - - MY_ALIGN -LSGEMM_L2x4_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,4,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x4 - ble LSGEMM_L2x4_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_L2x4_LOOP: - - KERNEL2x4_4 0,0, 0,0 - KERNEL2x4_4 0,0, 1,0 - KERNEL2x4_4 0,0, 2,0 - KERNEL2x4_4 0,0, 3,0 - KERNEL2x4_4 0,0, 4,0 - KERNEL2x4_4 0,0, 5,0 - KERNEL2x4_4 0,0, 6,0 - KERNEL2x4_4 0,0, 7,0 - KERNEL2x4_4 0,0, 8,0 - KERNEL2x4_4 0,0, 9,0 - KERNEL2x4_4 0,0, 10,0 - KERNEL2x4_4 0,0, 11,0 - KERNEL2x4_4 0,0, 12,0 - KERNEL2x4_4 0,0, 13,0 - KERNEL2x4_4 0,0, 14,0 - KERNEL2x4_4 0,0, 15,1 - - bdnz LSGEMM_L2x4_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_L2x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x4_SAVE - MY_ALIGN -LSGEMM_L2x4_SUB2: - andi. 
T10,L, 32 - ble LSGEMM_L2x4_SUB2_16 - KERNEL2x4_4 0,0, 0,0 - KERNEL2x4_4 0,0, 1,0 - KERNEL2x4_4 0,0, 2,0 - KERNEL2x4_4 0,0, 3,0 - KERNEL2x4_4 0,0, 4,0 - KERNEL2x4_4 0,0, 5,0 - KERNEL2x4_4 0,0, 6,0 - KERNEL2x4_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x4_SUB2_8 - KERNEL2x4_4 0,0, 0,0 - KERNEL2x4_4 0,0, 1,0 - KERNEL2x4_4 0,0, 2,0 - KERNEL2x4_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x4_SUB2_4 - KERNEL2x4_4 0,0, 0,0 - KERNEL2x4_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x4_SUB2_2 - KERNEL2x4_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x4_SUB2_1 - KERNEL2x4_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x4_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x4_SAVE - KERNEL2x4 - - MY_ALIGN -LSGEMM_L2x4_SAVE: - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,2 -#endif - MY_ALIGN -LSGEMM_L2x4_END: - andi. I, M, 2 - ble LSGEMM_L2x2_END - - MY_ALIGN -LSGEMM_L2x2_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,2,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x2 - ble LSGEMM_L2x2_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_L2x2_LOOP: - - KERNEL2x2_4 0,0, 0,0 - KERNEL2x2_4 0,0, 1,0 - KERNEL2x2_4 0,0, 2,0 - KERNEL2x2_4 0,0, 3,0 - KERNEL2x2_4 0,0, 4,0 - KERNEL2x2_4 0,0, 5,0 - KERNEL2x2_4 0,0, 6,0 - KERNEL2x2_4 0,0, 7,0 - KERNEL2x2_4 0,0, 8,0 - KERNEL2x2_4 0,0, 9,0 - KERNEL2x2_4 0,0, 10,0 - KERNEL2x2_4 0,0, 11,0 - KERNEL2x2_4 0,0, 12,0 - KERNEL2x2_4 0,0, 13,0 - KERNEL2x2_4 0,0, 14,0 - KERNEL2x2_4 0,0, 15,1 - - bdnz LSGEMM_L2x2_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_L2x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x2_SAVE - MY_ALIGN -LSGEMM_L2x2_SUB2: - andi. T10,L, 32 - ble LSGEMM_L2x2_SUB2_16 - KERNEL2x2_4 0,0, 0,0 - KERNEL2x2_4 0,0, 1,0 - KERNEL2x2_4 0,0, 2,0 - KERNEL2x2_4 0,0, 3,0 - KERNEL2x2_4 0,0, 4,0 - KERNEL2x2_4 0,0, 5,0 - KERNEL2x2_4 0,0, 6,0 - KERNEL2x2_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x2_SUB2_8 - KERNEL2x2_4 0,0, 0,0 - KERNEL2x2_4 0,0, 1,0 - KERNEL2x2_4 0,0, 2,0 - KERNEL2x2_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x2_SUB2_4 - KERNEL2x2_4 0,0, 0,0 - KERNEL2x2_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x2_SUB2_2 - KERNEL2x2_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x2_SUB2_1 - KERNEL2x2_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x2_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x2_SAVE - KERNEL2x2 - - MY_ALIGN -LSGEMM_L2x2_SAVE: - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,2 -#endif - MY_ALIGN -LSGEMM_L2x2_END: - andi. I, M, 1 - ble LSGEMM_L2x1_END - - MY_ALIGN -LSGEMM_L2x1_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,1,2 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. 
L, K, 6 /**(K ) % 64x */ -#endif - - ZERO2x1 - ble LSGEMM_L2x1_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_L2x1_LOOP: - - KERNEL2x1_4 0,0, 0,0 - KERNEL2x1_4 0,0, 1,0 - KERNEL2x1_4 0,0, 2,0 - KERNEL2x1_4 0,0, 3,0 - KERNEL2x1_4 0,0, 4,0 - KERNEL2x1_4 0,0, 5,0 - KERNEL2x1_4 0,0, 6,0 - KERNEL2x1_4 0,0, 7,0 - KERNEL2x1_4 0,0, 8,0 - KERNEL2x1_4 0,0, 9,0 - KERNEL2x1_4 0,0, 10,0 - KERNEL2x1_4 0,0, 11,0 - KERNEL2x1_4 0,0, 12,0 - KERNEL2x1_4 0,0, 13,0 - KERNEL2x1_4 0,0, 14,0 - KERNEL2x1_4 0,0, 15,1 - - bdnz LSGEMM_L2x1_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_L2x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_L2x1_SAVE - MY_ALIGN -LSGEMM_L2x1_SUB2: - andi. T10,L, 32 - ble LSGEMM_L2x1_SUB2_16 - KERNEL2x1_4 0,0, 0,0 - KERNEL2x1_4 0,0, 1,0 - KERNEL2x1_4 0,0, 2,0 - KERNEL2x1_4 0,0, 3,0 - KERNEL2x1_4 0,0, 4,0 - KERNEL2x1_4 0,0, 5,0 - KERNEL2x1_4 0,0, 6,0 - KERNEL2x1_4 0,0, 7,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_L2x1_SUB2_8 - KERNEL2x1_4 0,0, 0,0 - KERNEL2x1_4 0,0, 1,0 - KERNEL2x1_4 0,0, 2,0 - KERNEL2x1_4 0,0, 3,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_L2x1_SUB2_4 - KERNEL2x1_4 0,0, 0,0 - KERNEL2x1_4 0,0, 1,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_L2x1_SUB2_2 - KERNEL2x1_4 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_L2x1_SUB2_1 - KERNEL2x1_2 0,0, 0,1 - MY_ALIGN -LSGEMM_L2x1_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_L2x1_SAVE - KERNEL2x1 - - MY_ALIGN -LSGEMM_L2x1_SAVE: - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,2 -#endif - MY_ALIGN -LSGEMM_L2x1_END: - slwi T1, K, 3 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif -LSGEMM_L2_END: - andi. T1, N, 1 - ble LSGEMM_END -LSGEMM_1_BEGIN: - - - mr AO, A - mr CO, C - add C, C, LDC - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LSGEMM_1x16_END - - MY_ALIGN -LSGEMM_1x16_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,16,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x16 - ble LSGEMM_1x16_SUB0 - addi AO,AO,2048 - - mtctr L - - MY_ALIGN - -LSGEMM_1x16_LOOP: - - KERNEL1x16_4 -2048,0, 0,0 - KERNEL1x16_4 -2048,0, 1,0 - KERNEL1x16_4 -2048,0, 2,0 - KERNEL1x16_4 -2048,0, 3,0 - KERNEL1x16_4 -2048,0, 4,0 - KERNEL1x16_4 -2048,0, 5,0 - KERNEL1x16_4 -2048,0, 6,0 - KERNEL1x16_4 -2048,0, 7,0 - KERNEL1x16_4 -2048,0, 8,0 - KERNEL1x16_4 -2048,0, 9,0 - KERNEL1x16_4 -2048,0, 10,0 - KERNEL1x16_4 -2048,0, 11,0 - KERNEL1x16_4 -2048,0, 12,0 - KERNEL1x16_4 -2048,0, 13,0 - KERNEL1x16_4 -2048,0, 14,0 - KERNEL1x16_4 -2048,0, 15,1 - - bdnz LSGEMM_1x16_LOOP - MY_ALIGN - addi AO,AO, -2048 - MY_ALIGN -LSGEMM_1x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x16_SAVE - MY_ALIGN -LSGEMM_1x16_SUB2: - andi. T10,L, 32 - ble LSGEMM_1x16_SUB2_16 - KERNEL1x16_4 0,0, 0,0 - KERNEL1x16_4 0,0, 1,0 - KERNEL1x16_4 0,0, 2,0 - KERNEL1x16_4 0,0, 3,0 - KERNEL1x16_4 0,0, 4,0 - KERNEL1x16_4 0,0, 5,0 - KERNEL1x16_4 0,0, 6,0 - KERNEL1x16_4 0,0, 7,1 - MY_ALIGN -LSGEMM_1x16_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x16_SUB2_8 - KERNEL1x16_4 0,0, 0,0 - KERNEL1x16_4 0,0, 1,0 - KERNEL1x16_4 0,0, 2,0 - KERNEL1x16_4 0,0, 3,1 - MY_ALIGN -LSGEMM_1x16_SUB2_8: - andi. 
T10,L, 8 - ble LSGEMM_1x16_SUB2_4 - KERNEL1x16_4 0,0, 0,0 - KERNEL1x16_4 0,0, 1,1 - MY_ALIGN -LSGEMM_1x16_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x16_SUB2_2 - KERNEL1x16_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x16_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x16_SUB2_1 - KERNEL1x16_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x16_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x16_SAVE - KERNEL1x16 - - MY_ALIGN -LSGEMM_1x16_SAVE: - SAVE1x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,1 -#endif - addic. I, I, -1 - bgt+ LSGEMM_1x16_BEGIN - MY_ALIGN -LSGEMM_1x16_END: - andi. I, M, 8 - ble LSGEMM_1x8_END - - MY_ALIGN -LSGEMM_1x8_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,8,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x8 - ble LSGEMM_1x8_SUB0 - addi AO,AO,2048 - - mtctr L - - MY_ALIGN - -LSGEMM_1x8_LOOP: - - KERNEL1x8_4 -2048,0, 0,0 - KERNEL1x8_4 -2048,0, 1,0 - KERNEL1x8_4 -2048,0, 2,0 - KERNEL1x8_4 -2048,0, 3,0 - KERNEL1x8_4 -2048,0, 4,0 - KERNEL1x8_4 -2048,0, 5,0 - KERNEL1x8_4 -2048,0, 6,0 - KERNEL1x8_4 -2048,0, 7,0 - KERNEL1x8_4 -2048,0, 8,0 - KERNEL1x8_4 -2048,0, 9,0 - KERNEL1x8_4 -2048,0, 10,0 - KERNEL1x8_4 -2048,0, 11,0 - KERNEL1x8_4 -2048,0, 12,0 - KERNEL1x8_4 -2048,0, 13,0 - KERNEL1x8_4 -2048,0, 14,0 - KERNEL1x8_4 -2048,0, 15,1 - - bdnz LSGEMM_1x8_LOOP - MY_ALIGN - addi AO,AO, -2048 - MY_ALIGN -LSGEMM_1x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x8_SAVE - MY_ALIGN -LSGEMM_1x8_SUB2: - andi. T10,L, 32 - ble LSGEMM_1x8_SUB2_16 - KERNEL1x8_4 0,0, 0,0 - KERNEL1x8_4 0,0, 1,0 - KERNEL1x8_4 0,0, 2,0 - KERNEL1x8_4 0,0, 3,0 - KERNEL1x8_4 0,0, 4,0 - KERNEL1x8_4 0,0, 5,0 - KERNEL1x8_4 0,0, 6,0 - KERNEL1x8_4 0,0, 7,1 - MY_ALIGN -LSGEMM_1x8_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x8_SUB2_8 - KERNEL1x8_4 0,0, 0,0 - KERNEL1x8_4 0,0, 1,0 - KERNEL1x8_4 0,0, 2,0 - KERNEL1x8_4 0,0, 3,1 - MY_ALIGN -LSGEMM_1x8_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x8_SUB2_4 - KERNEL1x8_4 0,0, 0,0 - KERNEL1x8_4 0,0, 1,1 - MY_ALIGN -LSGEMM_1x8_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x8_SUB2_2 - KERNEL1x8_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x8_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x8_SUB2_1 - KERNEL1x8_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x8_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x8_SAVE - KERNEL1x8 - - MY_ALIGN -LSGEMM_1x8_SAVE: - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,1 -#endif - MY_ALIGN -LSGEMM_1x8_END: - andi. I, M, 4 - ble LSGEMM_1x4_END - - MY_ALIGN -LSGEMM_1x4_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,4,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x4 - ble LSGEMM_1x4_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_1x4_LOOP: - - KERNEL1x4_4 0,0, 0,0 - KERNEL1x4_4 0,0, 1,0 - KERNEL1x4_4 0,0, 2,0 - KERNEL1x4_4 0,0, 3,0 - KERNEL1x4_4 0,0, 4,0 - KERNEL1x4_4 0,0, 5,0 - KERNEL1x4_4 0,0, 6,0 - KERNEL1x4_4 0,0, 7,0 - KERNEL1x4_4 0,0, 8,0 - KERNEL1x4_4 0,0, 9,0 - KERNEL1x4_4 0,0, 10,0 - KERNEL1x4_4 0,0, 11,0 - KERNEL1x4_4 0,0, 12,0 - KERNEL1x4_4 0,0, 13,0 - KERNEL1x4_4 0,0, 14,0 - KERNEL1x4_4 0,0, 15,1 - - bdnz LSGEMM_1x4_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_1x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x4_SAVE - MY_ALIGN -LSGEMM_1x4_SUB2: - andi. 
T10,L, 32 - ble LSGEMM_1x4_SUB2_16 - KERNEL1x4_4 0,0, 0,0 - KERNEL1x4_4 0,0, 1,0 - KERNEL1x4_4 0,0, 2,0 - KERNEL1x4_4 0,0, 3,0 - KERNEL1x4_4 0,0, 4,0 - KERNEL1x4_4 0,0, 5,0 - KERNEL1x4_4 0,0, 6,0 - KERNEL1x4_4 0,0, 7,1 - MY_ALIGN -LSGEMM_1x4_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x4_SUB2_8 - KERNEL1x4_4 0,0, 0,0 - KERNEL1x4_4 0,0, 1,0 - KERNEL1x4_4 0,0, 2,0 - KERNEL1x4_4 0,0, 3,1 - MY_ALIGN -LSGEMM_1x4_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x4_SUB2_4 - KERNEL1x4_4 0,0, 0,0 - KERNEL1x4_4 0,0, 1,1 - MY_ALIGN -LSGEMM_1x4_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x4_SUB2_2 - KERNEL1x4_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x4_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x4_SUB2_1 - KERNEL1x4_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x4_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x4_SAVE - KERNEL1x4 - - MY_ALIGN -LSGEMM_1x4_SAVE: - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,1 -#endif - MY_ALIGN -LSGEMM_1x4_END: - andi. I, M, 2 - ble LSGEMM_1x2_END - - MY_ALIGN -LSGEMM_1x2_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,2,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x2 - ble LSGEMM_1x2_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_1x2_LOOP: - - KERNEL1x2_4 0,0, 0,0 - KERNEL1x2_4 0,0, 1,0 - KERNEL1x2_4 0,0, 2,0 - KERNEL1x2_4 0,0, 3,0 - KERNEL1x2_4 0,0, 4,0 - KERNEL1x2_4 0,0, 5,0 - KERNEL1x2_4 0,0, 6,0 - KERNEL1x2_4 0,0, 7,0 - KERNEL1x2_4 0,0, 8,0 - KERNEL1x2_4 0,0, 9,0 - KERNEL1x2_4 0,0, 10,0 - KERNEL1x2_4 0,0, 11,0 - KERNEL1x2_4 0,0, 12,0 - KERNEL1x2_4 0,0, 13,0 - KERNEL1x2_4 0,0, 14,0 - KERNEL1x2_4 0,0, 15,1 - - bdnz LSGEMM_1x2_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_1x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x2_SAVE - MY_ALIGN -LSGEMM_1x2_SUB2: - andi. T10,L, 32 - ble LSGEMM_1x2_SUB2_16 - KERNEL1x2_4 0,0, 0,0 - KERNEL1x2_4 0,0, 1,0 - KERNEL1x2_4 0,0, 2,0 - KERNEL1x2_4 0,0, 3,0 - KERNEL1x2_4 0,0, 4,0 - KERNEL1x2_4 0,0, 5,0 - KERNEL1x2_4 0,0, 6,0 - KERNEL1x2_4 0,0, 7,1 - MY_ALIGN -LSGEMM_1x2_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x2_SUB2_8 - KERNEL1x2_4 0,0, 0,0 - KERNEL1x2_4 0,0, 1,0 - KERNEL1x2_4 0,0, 2,0 - KERNEL1x2_4 0,0, 3,1 - MY_ALIGN -LSGEMM_1x2_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x2_SUB2_4 - KERNEL1x2_4 0,0, 0,0 - KERNEL1x2_4 0,0, 1,1 - MY_ALIGN -LSGEMM_1x2_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x2_SUB2_2 - KERNEL1x2_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x2_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x2_SUB2_1 - KERNEL1x2_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x2_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x2_SAVE - KERNEL1x2 - - MY_ALIGN -LSGEMM_1x2_SAVE: - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,1 -#endif - MY_ALIGN -LSGEMM_1x2_END: - andi. I, M, 1 - ble LSGEMM_1x1_END - - MY_ALIGN -LSGEMM_1x1_BEGIN: - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 -#else - mr BO, B -#endif - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T11,K,TEMP_REG,1,1 - srawi. L, T11, 6 /**(T11 ) % 64x */ -#else - srawi. L, K, 6 /**(K ) % 64x */ -#endif - - ZERO1x1 - ble LSGEMM_1x1_SUB0 - - - mtctr L - - MY_ALIGN - -LSGEMM_1x1_LOOP: - - KERNEL1x1_16 0,0, 0,0 - KERNEL1x1_16 0,0, 1,0 - KERNEL1x1_16 0,0, 2,0 - KERNEL1x1_16 0,0, 3,1 - - bdnz LSGEMM_1x1_LOOP - MY_ALIGN - - MY_ALIGN -LSGEMM_1x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T11, 63 -#else - andi. L, K, 63 -#endif - ble LSGEMM_1x1_SAVE - MY_ALIGN -LSGEMM_1x1_SUB2: - andi. 
T10,L, 32 - ble LSGEMM_1x1_SUB2_16 - KERNEL1x1_16 0,0, 0,0 - KERNEL1x1_16 0,0, 1,1 - MY_ALIGN -LSGEMM_1x1_SUB2_16: - andi. T10,L, 16 - ble LSGEMM_1x1_SUB2_8 - KERNEL1x1_16 0,0, 0,1 - MY_ALIGN -LSGEMM_1x1_SUB2_8: - andi. T10,L, 8 - ble LSGEMM_1x1_SUB2_4 - KERNEL1x1_8 0,0, 0,1 - MY_ALIGN -LSGEMM_1x1_SUB2_4: - andi. T10,L, 4 - ble LSGEMM_1x1_SUB2_2 - KERNEL1x1_4 0,0, 0,1 - MY_ALIGN -LSGEMM_1x1_SUB2_2: - andi. T10,L, 2 - ble LSGEMM_1x1_SUB2_1 - KERNEL1x1_2 0,0, 0,1 - MY_ALIGN -LSGEMM_1x1_SUB2_1: - andi. T10,L, 1 - ble LSGEMM_1x1_SAVE - KERNEL1x1 - - MY_ALIGN -LSGEMM_1x1_SAVE: - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,1 -#endif - MY_ALIGN -LSGEMM_1x1_END: - slwi T1, K, 2 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif +#define MY_ALIGN .align 3 +b L8 + + MY_ALIGN +LSGEMM_L8x16_LMAIN_SUB: + LOAD8x16_2 + MY_ALIGN + +LSGEMM_L8x16_LOOP: + KERNEL8x16_L2 128,64,0,0 +LSGEMM_L8x16_K128: + KERNEL8x16_L2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64, 1,0 + KERNEL8x16_I1_L4_2 128,64, 2,0 + KERNEL8x16_I1_L4_2 128,64, 3,0 + KERNEL8x16_I1_L4_2 128,64, 4,0 + KERNEL8x16_I1_L4_2 128,64, 5,0 + KERNEL8x16_I1_L4_2 128,64, 6,0 + KERNEL8x16_I1_L4_2 128,64, 7,0 + KERNEL8x16_I1_L4_2 128,64, 8,0 + KERNEL8x16_I1_L4_2 128,64, 9,0 + KERNEL8x16_I1_L4_2 128,64, 10,0 + KERNEL8x16_I1_L4_2 128,64, 11,0 + KERNEL8x16_I1_L4_2 128,64, 12,0 + KERNEL8x16_I1_L4_2 128,64, 13,0 + KERNEL8x16_I1_L4_2 128,64, 14,0 + KERNEL8x16_I1_L4_2 128,64, 15,0 + KERNEL8x16_I1_L4_2 128,64, 16,0 + KERNEL8x16_I1_L4_2 128,64, 17,0 + KERNEL8x16_I1_L4_2 128,64, 18,0 + KERNEL8x16_I1_L4_2 128,64, 19,0 + KERNEL8x16_I1_L4_2 128,64, 20,0 + KERNEL8x16_I1_L4_2 128,64, 21,0 + KERNEL8x16_I1_L4_2 128,64, 22,0 + KERNEL8x16_I1_L4_2 128,64, 23,0 + KERNEL8x16_I1_L4_2 128,64, 24,0 + KERNEL8x16_I1_L4_2 128,64, 25,0 + KERNEL8x16_I1_L4_2 128,64, 26,0 + KERNEL8x16_I1_L4_2 128,64, 27,0 + KERNEL8x16_I1_L4_2 128,64, 28,0 + KERNEL8x16_I1_L4_2 128,64, 29,0 + KERNEL8x16_I1_L4_2 128,64, 30,0 + KERNEL8x16_I1_L4_2 128,64, 31,1 + bdnz LSGEMM_L8x16_LOOP + + MY_ALIGN +LSGEMM_L8x16_LOOP_END: + END8x16_2 + blr + + MY_ALIGN +LSGEMM_L8x16_L64_SUB: + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64, 0,0 + KERNEL8x16_I1_L4_2 128,64, 1,0 + KERNEL8x16_I1_L4_2 128,64, 2,0 + KERNEL8x16_I1_L4_2 128,64,3,0 + KERNEL8x16_I1_L4_2 128,64,4,0 + KERNEL8x16_I1_L4_2 128,64,5,0 + KERNEL8x16_I1_L4_2 128,64,6,0 + KERNEL8x16_I1_L4_2 128,64,7,0 + KERNEL8x16_I1_L4_2 128,64,8,0 + KERNEL8x16_I1_L4_2 128,64,9,0 + KERNEL8x16_I1_L4_2 128,64,10,0 + KERNEL8x16_I1_L4_2 128,64,11,0 + KERNEL8x16_I1_L4_2 128,64,12,0 + KERNEL8x16_I1_L4_2 128,64,13,0 + KERNEL8x16_I1_L4_2 128,64,14,0 + KERNEL8x16_I1_L4_3 128,64,15,1 + blr +LSGEMM_L8x16_L32_SUB: + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64,0,0 + KERNEL8x16_I1_L4_2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64,2,0 + KERNEL8x16_I1_L4_2 128,64,3,0 + KERNEL8x16_I1_L4_2 128,64,4,0 + KERNEL8x16_I1_L4_2 128,64,5,0 + KERNEL8x16_I1_L4_2 128,64,6,0 + KERNEL8x16_I1_L4_3 128,64,7,1 + blr + +LSGEMM_L8x16_L16_SUB: + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64,0,0 + KERNEL8x16_I1_L4_2 128,64,1,0 + KERNEL8x16_I1_L4_2 128,64,2,0 + KERNEL8x16_I1_L4_3 128,64,3,1 + blr + +L8: +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + + srawi. J, N, 3 + + ble LSGEMM_L8_END + +LSGEMM_L8_BEGIN: + + li T1, 128 + li T2, 256 + + mr AO, A + mr CO, C + slwi T3, LDC , 3 + add C, C, T3 + + dcbt A, T1 + dcbt A, T2 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. 
I, M, 4 + ble LSGEMM_L8x16_END + + MY_ALIGN +LSGEMM_L8x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,8 + mr T12, T11 + addi T12,T12, -2 + srawi. L, T12, 7 /**(T11-2) % 128x */ +#else + mr T12, K + addi T12,T12, -2 + srawi. L, T12, 7 /**(K-2) % 128x */ +#endif + + ZERO8x16 + ble LSGEMM_L8x16_SUB0 + mtctr L + bl LSGEMM_L8x16_LMAIN_SUB + andi. L, T12, 127 + ble LSGEMM_L8x16_SAVE + b LSGEMM_L8x16_SUB2 + MY_ALIGN +LSGEMM_L8x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 255 + cmpwi T11,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T10,1 + bne CMP8x16_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD8x16 64,32 + END8x16_WITHOUT_ADD + LOAD8x16_2O AO,BO, 128, 64 + mtctr T10 + bl LSGEMM_L8x16_K128 + b LSGEMM_L8x16_SAVE +CMP8x16_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T11,128 +#else + cmpwi K,128 +#endif + bne LSGEMM_L8x16_SUB2 + MY_ALIGN + mtctr T10 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD8x16_2O AO,BO, 128,64 + bl LSGEMM_L8x16_K128 + b LSGEMM_L8x16_SAVE + MY_ALIGN +LSGEMM_L8x16_SUB2: + andi. T10,L,64 + ble LSGEMM_L8x16_SUB2_32 + bl LSGEMM_L8x16_L64_SUB + MY_ALIGN +LSGEMM_L8x16_SUB2_32: + andi. T10,L, 32 + ble LSGEMM_L8x16_SUB2_16 + bl LSGEMM_L8x16_L32_SUB + MY_ALIGN +LSGEMM_L8x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L8x16_SUB2_8 + bl LSGEMM_L8x16_L16_SUB + MY_ALIGN +LSGEMM_L8x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L8x16_SUB2_4 + LOAD8x16_2 + KERNEL8x16_I1_L4_2 128,64, 0,0 + KERNEL8x16_I1_L4_3 128,64, 1,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L8x16_SUB2_2 + LOAD8x16_2 + KERNEL8x16_I1_L4_3 128,64, 0,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L8x16_SUB2_1 + LOAD8x16_2 + KERNEL8x16_E2 128,64, 0,1 + MY_ALIGN +LSGEMM_L8x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L8x16_SAVE + KERNEL8x16 0 + + + MY_ALIGN +LSGEMM_L8x16_SAVE: + SAVE8x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,8 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L8x16_BEGIN + MY_ALIGN +LSGEMM_L8x16_END: +LSGEMM_L8x8_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L8x1_END + + andi. T1, M, 8 + ble LSGEMM_L8x8_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,8 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO8x8 + ble LSGEMM_L8x8_SUB0 + + MY_ALIGN +LSGEMM_L8x8_LOOP_START: + + LOAD8x8_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L8x8_LOOP: + + KERNEL8x8_I1_L4_2 32,32, 0,0 + KERNEL8x8_I1_L4_2 32,32, 1,0 + KERNEL8x8_I1_L4_2 32,32, 2,0 + KERNEL8x8_I1_L4_2 32,32, 3,1 + + bdnz LSGEMM_L8x8_LOOP + + MY_ALIGN +LSGEMM_L8x8_LOOP_END: + + END8x8 0, AO, BO, 32, 32 + + b LSGEMM_L8x8_SUB1 + MY_ALIGN +LSGEMM_L8x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L8x8_SUB2 + MY_ALIGN +LSGEMM_L8x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L8x8_SAVE + MY_ALIGN +LSGEMM_L8x8_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L8x8_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L8x8_SUB2_LOOP: + LOAD8x8_0 + KERNEL8x8_I1_L4_2 32,32, 0,0 + KERNEL8x8_I1_L4_3 32,32, 1,1 + bdnz LSGEMM_L8x8_SUB2_LOOP + MY_ALIGN +LSGEMM_L8x8_SUB2_4: + andi. 
T1,L, 4 + ble LSGEMM_L8x8_SUB2_2 + LOAD8x8_0 + KERNEL8x8_I1_L4_3 32,32, 0,1 + MY_ALIGN +LSGEMM_L8x8_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x8_SUB2_1 + LOAD8x8_0 + KERNEL8x8_I1_L2_3 32,32, 0,1 + MY_ALIGN +LSGEMM_L8x8_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x8_SAVE + KERNEL8x8 0 + + + MY_ALIGN +LSGEMM_L8x8_SAVE: + SAVE8x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,8 +#endif + MY_ALIGN +LSGEMM_L8x8_END: +LSGEMM_L8x4_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L8x1_END + + andi. T1, M, 4 + ble LSGEMM_L8x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,8 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO8x4 + ble LSGEMM_L8x4_SUB0 + + MY_ALIGN +LSGEMM_L8x4_LOOP_START: + + LOAD8x4_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L8x4_LOOP: + + KERNEL8x4_I1_L4_2 16,32, 0,0 + KERNEL8x4_I1_L4_2 16,32, 1,0 + KERNEL8x4_I1_L4_2 16,32, 2,0 + KERNEL8x4_I1_L4_2 16,32, 3,1 + + bdnz LSGEMM_L8x4_LOOP + + MY_ALIGN +LSGEMM_L8x4_LOOP_END: + + END8x4 0, AO, BO, 16, 32 + + b LSGEMM_L8x4_SUB1 + MY_ALIGN +LSGEMM_L8x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L8x4_SUB2 + MY_ALIGN +LSGEMM_L8x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L8x4_SAVE + MY_ALIGN +LSGEMM_L8x4_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L8x4_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L8x4_SUB2_LOOP: + LOAD8x4_0 + KERNEL8x4_I1_L4_2 16,32, 0,0 + KERNEL8x4_I1_L4_3 16,32, 1,1 + bdnz LSGEMM_L8x4_SUB2_LOOP + MY_ALIGN +LSGEMM_L8x4_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L8x4_SUB2_2 + LOAD8x4_0 + KERNEL8x4_I1_L4_3 16,32, 0,1 + MY_ALIGN +LSGEMM_L8x4_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x4_SUB2_1 + LOAD8x4_0 + KERNEL8x4_I1_L2_3 16,32, 0,1 + MY_ALIGN +LSGEMM_L8x4_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x4_SAVE + KERNEL8x4 0 + + + MY_ALIGN +LSGEMM_L8x4_SAVE: + SAVE8x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,8 +#endif + MY_ALIGN +LSGEMM_L8x4_END: +LSGEMM_L8x2_BEGIN: + andi. T1, M, 2 + ble LSGEMM_L8x2_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,8 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO8x2 + ble LSGEMM_L8x2_SUB0 + + MY_ALIGN +LSGEMM_L8x2_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L8x2_LOOP: + + KERNEL8x2_2 0,0, 0,0 + KERNEL8x2_2 0,0, 1,0 + KERNEL8x2_2 0,0, 2,0 + KERNEL8x2_2 0,0, 3,1 + + bdnz LSGEMM_L8x2_LOOP + + MY_ALIGN +LSGEMM_L8x2_LOOP_END: + +LSGEMM_L8x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L8x2_SAVE + MY_ALIGN +LSGEMM_L8x2_SUB2: + andi. T1,L, 4 + ble LSGEMM_L8x2_SUB2_2 + KERNEL8x2_2 0,0, 0,0 + KERNEL8x2_2 0,0, 1,1 + MY_ALIGN +LSGEMM_L8x2_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x2_SUB2_1 + KERNEL8x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L8x2_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x2_SAVE + KERNEL8x2 + + MY_ALIGN +LSGEMM_L8x2_SAVE: + SAVE8x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,8 +#endif + MY_ALIGN +LSGEMM_L8x2_END: +LSGEMM_L8x1_BEGIN: + andi. T1, M, 1 + ble LSGEMM_L8x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,8 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,8 + srawi. 
L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO8x1 + ble LSGEMM_L8x1_SUB0 + + MY_ALIGN +LSGEMM_L8x1_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L8x1_LOOP: + + KERNEL8x1_4 0,0, 0,0 + KERNEL8x1_4 0,0, 1,1 + + bdnz LSGEMM_L8x1_LOOP + + MY_ALIGN +LSGEMM_L8x1_LOOP_END: + +LSGEMM_L8x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L8x1_SAVE + MY_ALIGN +LSGEMM_L8x1_SUB2: + andi. T1,L, 4 + ble LSGEMM_L8x1_SUB2_2 + KERNEL8x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L8x1_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L8x1_SUB2_1 + KERNEL8x1_2 + MY_ALIGN +LSGEMM_L8x1_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L8x1_SAVE + KERNEL8x1 + + MY_ALIGN +LSGEMM_L8x1_SAVE: + SAVE8x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,8 +#endif + MY_ALIGN +LSGEMM_L8x1_END: + + slwi T1, K, 5 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 8 +#endif + addic. J, J, -1 + bgt LSGEMM_L8_BEGIN + + +LSGEMM_L8_END: + +/* b LSGEMM_L4_BEGIN*/ + andi. T1, N, 4 + ble LSGEMM_L4_END +LSGEMM_L4_BEGIN: + + + mr AO, A + mr CO, C + slwi T3, LDC , 2 + add C, C, T3 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L4x16_END + + MY_ALIGN +LSGEMM_L4x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 6 /**(T11-1) % 64x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 6 /**(K-1) % 64x */ +#endif + + ZERO4x16 + ble LSGEMM_L4x16_SUB0 + + MY_ALIGN +LSGEMM_L4x16_LOOP_START: + + LOAD4x16_0 /*we already zeroed */ + ##OffsetA=64 OffsetB=16 + addi AO,AO,2112 + addi BO,BO,16 + + mtctr L + + MY_ALIGN + +LSGEMM_L4x16_LOOP: + + KERNEL4x16_I1_L4_2 -2048,0, 0,0 + KERNEL4x16_I1_L4_2 -2048,0, 1,0 + KERNEL4x16_I1_L4_2 -2048,0, 2,0 + KERNEL4x16_I1_L4_2 -2048,0, 3,0 + KERNEL4x16_I1_L4_2 -2048,0, 4,0 + KERNEL4x16_I1_L4_2 -2048,0, 5,0 + KERNEL4x16_I1_L4_2 -2048,0, 6,0 + KERNEL4x16_I1_L4_2 -2048,0, 7,0 + KERNEL4x16_I1_L4_2 -2048,0, 8,0 + KERNEL4x16_I1_L4_2 -2048,0, 9,0 + KERNEL4x16_I1_L4_2 -2048,0, 10,0 + KERNEL4x16_I1_L4_2 -2048,0, 11,0 + KERNEL4x16_I1_L4_2 -2048,0, 12,0 + KERNEL4x16_I1_L4_2 -2048,0, 13,0 + KERNEL4x16_I1_L4_2 -2048,0, 14,0 + KERNEL4x16_I1_L4_2 -2048,0, 15,1 + + bdnz LSGEMM_L4x16_LOOP + + MY_ALIGN +LSGEMM_L4x16_LOOP_END: + + END4x16 0, AO, BO, -2048, 0 + + b LSGEMM_L4x16_SUB1 + MY_ALIGN +LSGEMM_L4x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 127 +#else + andi. L, K, 127 +#endif + b LSGEMM_L4x16_SUB2 + MY_ALIGN +LSGEMM_L4x16_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 63 +#else + andi. L, T12, 63 +#endif + ble LSGEMM_L4x16_SAVE + MY_ALIGN +LSGEMM_L4x16_SUB2: + + srawi. T10,L, 5 + ble LSGEMM_L4x16_SUB2_16 + mtctr T10 + MY_ALIGN +LSGEMM_L4x16_SUB2_LOOP: + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_2 64,16, 1,0 + KERNEL4x16_I1_L4_2 64,16, 2,0 + KERNEL4x16_I1_L4_2 64,16, 3,0 + KERNEL4x16_I1_L4_2 64,16, 4,0 + KERNEL4x16_I1_L4_2 64,16, 5,0 + KERNEL4x16_I1_L4_2 64,16, 6,0 + KERNEL4x16_I1_L4_3 64,16, 7,1 + bdnz LSGEMM_L4x16_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L4x16_SUB2_8 + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_2 64,16, 1,0 + KERNEL4x16_I1_L4_2 64,16, 2,0 + KERNEL4x16_I1_L4_3 64,16, 3,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_8: + andi. 
T10,L, 8 + ble LSGEMM_L4x16_SUB2_4 + LOAD4x16_0 + KERNEL4x16_I1_L4_2 64,16, 0,0 + KERNEL4x16_I1_L4_3 64,16, 1,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L4x16_SUB2_2 + LOAD4x16_0 + KERNEL4x16_I1_L4_3 64,16, 0,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L4x16_SUB2_1 + LOAD4x16_0 + KERNEL4x16_I1_L2_3 64,16, 0,1 + MY_ALIGN +LSGEMM_L4x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L4x16_SAVE + KERNEL4x16 0 +# addic. L, L, -1 +# bgt LSGEMM_L4x16_SUB2 + + MY_ALIGN +LSGEMM_L4x16_SAVE: + SAVE4x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,4 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L4x16_BEGIN + MY_ALIGN +LSGEMM_L4x16_END: +LSGEMM_L4x8_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L4x1_END + + andi. T1, M, 8 + ble LSGEMM_L4x8_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO4x8 + ble LSGEMM_L4x8_SUB0 + + MY_ALIGN +LSGEMM_L4x8_LOOP_START: + + LOAD4x8_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L4x8_LOOP: + + KERNEL4x8_I1_L4_2 32,16, 0,0 + KERNEL4x8_I1_L4_2 32,16, 1,0 + KERNEL4x8_I1_L4_2 32,16, 2,0 + KERNEL4x8_I1_L4_2 32,16, 3,1 + + bdnz LSGEMM_L4x8_LOOP + + MY_ALIGN +LSGEMM_L4x8_LOOP_END: + + END4x8 0, AO, BO, 32, 16 + + b LSGEMM_L4x8_SUB1 + MY_ALIGN +LSGEMM_L4x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L4x8_SUB2 + MY_ALIGN +LSGEMM_L4x8_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L4x8_SAVE + MY_ALIGN +LSGEMM_L4x8_SUB2: + + srawi. T1,L, 3 + ble LSGEMM_L4x8_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L4x8_SUB2_LOOP: + LOAD4x8_0 + KERNEL4x8_I1_L4_2 32,16, 0,0 + KERNEL4x8_I1_L4_3 32,16, 1,1 + bdnz LSGEMM_L4x8_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x8_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L4x8_SUB2_2 + LOAD4x8_0 + KERNEL4x8_I1_L4_3 32,16, 0,1 + MY_ALIGN +LSGEMM_L4x8_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x8_SUB2_1 + LOAD4x8_0 + KERNEL4x8_I1_L2_3 32,16, 0,1 + MY_ALIGN +LSGEMM_L4x8_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x8_SAVE + KERNEL4x8 0 + + + MY_ALIGN +LSGEMM_L4x8_SAVE: + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,4 +#endif + MY_ALIGN +LSGEMM_L4x8_END: +LSGEMM_L4x4_BEGIN: + andi. T2, M, 15 + ble LSGEMM_L4x1_END + + andi. T1, M, 4 + ble LSGEMM_L4x4_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,4 + mr T12, T11 + addi T12,T12, -1 + srawi. L, T12, 4 /**(T11-1) % 16x */ +#else + mr T12, K + addi T12,T12, -1 + srawi. L, T12, 4 /**(K-1) % 16x */ +#endif + + ZERO4x4 + ble LSGEMM_L4x4_SUB0 + + MY_ALIGN +LSGEMM_L4x4_LOOP_START: + + LOAD4x4_0 /*we already zeroed */ + mtctr L + + MY_ALIGN + +LSGEMM_L4x4_LOOP: + + KERNEL4x4_I1_L4_2 16,16, 0,0 + KERNEL4x4_I1_L4_2 16,16, 1,0 + KERNEL4x4_I1_L4_2 16,16, 2,0 + KERNEL4x4_I1_L4_2 16,16, 3,1 + + bdnz LSGEMM_L4x4_LOOP + + MY_ALIGN +LSGEMM_L4x4_LOOP_END: + + END4x4 0, AO, BO, 16, 16 + + b LSGEMM_L4x4_SUB1 + MY_ALIGN +LSGEMM_L4x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 31 +#else + andi. L, K, 31 +#endif + b LSGEMM_L4x4_SUB2 + MY_ALIGN +LSGEMM_L4x4_SUB1: +#if defined(TRMMKERNEL) + andi. L, T12, 15 +#else + andi. L, T12, 15 +#endif + ble LSGEMM_L4x4_SAVE + MY_ALIGN +LSGEMM_L4x4_SUB2: + + srawi. 
T1,L, 3 + ble LSGEMM_L4x4_SUB2_4 + mtctr T1 + MY_ALIGN +LSGEMM_L4x4_SUB2_LOOP: + LOAD4x4_0 + KERNEL4x4_I1_L4_2 16,16, 0,0 + KERNEL4x4_I1_L4_3 16,16, 1,1 + bdnz LSGEMM_L4x4_SUB2_LOOP + MY_ALIGN +LSGEMM_L4x4_SUB2_4: + andi. T1,L, 4 + ble LSGEMM_L4x4_SUB2_2 + LOAD4x4_0 + KERNEL4x4_I1_L4_3 16,16, 0,1 + MY_ALIGN +LSGEMM_L4x4_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x4_SUB2_1 + LOAD4x4_0 + KERNEL4x4_I1_L2_3 16,16, 0,1 + MY_ALIGN +LSGEMM_L4x4_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x4_SAVE + KERNEL4x4 0 + + + MY_ALIGN +LSGEMM_L4x4_SAVE: + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,4 +#endif + MY_ALIGN +LSGEMM_L4x4_END: +LSGEMM_L4x2_BEGIN: + andi. T1, M, 2 + ble LSGEMM_L4x2_END + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,4 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO4x2 + ble LSGEMM_L4x2_SUB0 + + MY_ALIGN +LSGEMM_L4x2_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L4x2_LOOP: + + KERNEL4x2_2 0,0, 0,0 + KERNEL4x2_2 0,0, 1,0 + KERNEL4x2_2 0,0, 2,0 + KERNEL4x2_2 0,0, 3,1 + + bdnz LSGEMM_L4x2_LOOP + + MY_ALIGN +LSGEMM_L4x2_LOOP_END: + +LSGEMM_L4x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L4x2_SAVE + MY_ALIGN +LSGEMM_L4x2_SUB2: + andi. T1,L, 4 + ble LSGEMM_L4x2_SUB2_2 + KERNEL4x2_2 0,0, 0,0 + KERNEL4x2_2 0,0, 1,1 + MY_ALIGN +LSGEMM_L4x2_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x2_SUB2_1 + KERNEL4x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L4x2_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +LSGEMM_L4x2_SAVE: + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,4 +#endif + MY_ALIGN +LSGEMM_L4x2_END: +LSGEMM_L4x1_BEGIN: + andi. T1, M, 1 + ble LSGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,4 + srawi. L, T11, 3 /**(T11) % 8x */ +#else + srawi. L, K, 3 /**(K) % 8x */ +#endif + + ZERO4x1 + ble LSGEMM_L4x1_SUB0 + + MY_ALIGN +LSGEMM_L4x1_LOOP_START: + mtctr L + + MY_ALIGN + +LSGEMM_L4x1_LOOP: + + KERNEL4x1_4 0,0, 0,0 + KERNEL4x1_4 0,0, 1,1 + + bdnz LSGEMM_L4x1_LOOP + + MY_ALIGN +LSGEMM_L4x1_LOOP_END: + +LSGEMM_L4x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 7 +#else + andi. L, K, 7 +#endif + ble LSGEMM_L4x1_SAVE + MY_ALIGN +LSGEMM_L4x1_SUB2: + andi. T1,L, 4 + ble LSGEMM_L4x1_SUB2_2 + KERNEL4x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L4x1_SUB2_2: + andi. T1,L, 2 + ble LSGEMM_L4x1_SUB2_1 + KERNEL4x1_2 + MY_ALIGN +LSGEMM_L4x1_SUB2_1: + andi. T1,L, 1 + ble LSGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +LSGEMM_L4x1_SAVE: + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,4 +#endif + MY_ALIGN +LSGEMM_L4x1_END: + + slwi T1, K, 4 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + + andi. T2, N, 3 + ble .L999 + +LSGEMM_L4_END: + andi. T1, N, 2 + ble LSGEMM_L2_END +LSGEMM_L2_BEGIN: + + + mr AO, A + mr CO, C + slwi T3, LDC , 1 + add C, C, T3 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_L2x16_END + + MY_ALIGN +LSGEMM_L2x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. 
L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x16 + ble LSGEMM_L2x16_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_L2x16_LOOP: + + KERNEL2x16_4 -2048,0, 0,0 + KERNEL2x16_4 -2048,0, 1,0 + KERNEL2x16_4 -2048,0, 2,0 + KERNEL2x16_4 -2048,0, 3,0 + KERNEL2x16_4 -2048,0, 4,0 + KERNEL2x16_4 -2048,0, 5,0 + KERNEL2x16_4 -2048,0, 6,0 + KERNEL2x16_4 -2048,0, 7,0 + KERNEL2x16_4 -2048,0, 8,0 + KERNEL2x16_4 -2048,0, 9,0 + KERNEL2x16_4 -2048,0, 10,0 + KERNEL2x16_4 -2048,0, 11,0 + KERNEL2x16_4 -2048,0, 12,0 + KERNEL2x16_4 -2048,0, 13,0 + KERNEL2x16_4 -2048,0, 14,0 + KERNEL2x16_4 -2048,0, 15,1 + + bdnz LSGEMM_L2x16_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_L2x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x16_SAVE + MY_ALIGN +LSGEMM_L2x16_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x16_SUB2_16 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,0 + KERNEL2x16_4 0,0, 2,0 + KERNEL2x16_4 0,0, 3,0 + KERNEL2x16_4 0,0, 4,0 + KERNEL2x16_4 0,0, 5,0 + KERNEL2x16_4 0,0, 6,0 + KERNEL2x16_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x16_SUB2_8 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,0 + KERNEL2x16_4 0,0, 2,0 + KERNEL2x16_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x16_SUB2_4 + KERNEL2x16_4 0,0, 0,0 + KERNEL2x16_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x16_SUB2_2 + KERNEL2x16_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x16_SUB2_1 + KERNEL2x16_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x16_SAVE + KERNEL2x16 + + MY_ALIGN +LSGEMM_L2x16_SAVE: + SAVE2x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,2 +#endif + addic. I, I, -1 + bgt+ LSGEMM_L2x16_BEGIN + MY_ALIGN +LSGEMM_L2x16_END: + andi. I, M, 8 + ble LSGEMM_L2x8_END + + MY_ALIGN +LSGEMM_L2x8_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x8 + ble LSGEMM_L2x8_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_L2x8_LOOP: + + KERNEL2x8_4 -2048,0, 0,0 + KERNEL2x8_4 -2048,0, 1,0 + KERNEL2x8_4 -2048,0, 2,0 + KERNEL2x8_4 -2048,0, 3,0 + KERNEL2x8_4 -2048,0, 4,0 + KERNEL2x8_4 -2048,0, 5,0 + KERNEL2x8_4 -2048,0, 6,0 + KERNEL2x8_4 -2048,0, 7,0 + KERNEL2x8_4 -2048,0, 8,0 + KERNEL2x8_4 -2048,0, 9,0 + KERNEL2x8_4 -2048,0, 10,0 + KERNEL2x8_4 -2048,0, 11,0 + KERNEL2x8_4 -2048,0, 12,0 + KERNEL2x8_4 -2048,0, 13,0 + KERNEL2x8_4 -2048,0, 14,0 + KERNEL2x8_4 -2048,0, 15,1 + + bdnz LSGEMM_L2x8_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_L2x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x8_SAVE + MY_ALIGN +LSGEMM_L2x8_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x8_SUB2_16 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,0 + KERNEL2x8_4 0,0, 2,0 + KERNEL2x8_4 0,0, 3,0 + KERNEL2x8_4 0,0, 4,0 + KERNEL2x8_4 0,0, 5,0 + KERNEL2x8_4 0,0, 6,0 + KERNEL2x8_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x8_SUB2_8 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,0 + KERNEL2x8_4 0,0, 2,0 + KERNEL2x8_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x8_SUB2_4 + KERNEL2x8_4 0,0, 0,0 + KERNEL2x8_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x8_SUB2_2 + KERNEL2x8_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_2: + andi. 
T10,L, 2 + ble LSGEMM_L2x8_SUB2_1 + KERNEL2x8_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x8_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +LSGEMM_L2x8_SAVE: + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,2 +#endif + MY_ALIGN +LSGEMM_L2x8_END: + andi. I, M, 4 + ble LSGEMM_L2x4_END + + MY_ALIGN +LSGEMM_L2x4_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x4 + ble LSGEMM_L2x4_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x4_LOOP: + + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,0 + KERNEL2x4_4 0,0, 4,0 + KERNEL2x4_4 0,0, 5,0 + KERNEL2x4_4 0,0, 6,0 + KERNEL2x4_4 0,0, 7,0 + KERNEL2x4_4 0,0, 8,0 + KERNEL2x4_4 0,0, 9,0 + KERNEL2x4_4 0,0, 10,0 + KERNEL2x4_4 0,0, 11,0 + KERNEL2x4_4 0,0, 12,0 + KERNEL2x4_4 0,0, 13,0 + KERNEL2x4_4 0,0, 14,0 + KERNEL2x4_4 0,0, 15,1 + + bdnz LSGEMM_L2x4_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x4_SAVE + MY_ALIGN +LSGEMM_L2x4_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x4_SUB2_16 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,0 + KERNEL2x4_4 0,0, 4,0 + KERNEL2x4_4 0,0, 5,0 + KERNEL2x4_4 0,0, 6,0 + KERNEL2x4_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x4_SUB2_8 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,0 + KERNEL2x4_4 0,0, 2,0 + KERNEL2x4_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x4_SUB2_4 + KERNEL2x4_4 0,0, 0,0 + KERNEL2x4_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x4_SUB2_2 + KERNEL2x4_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x4_SUB2_1 + KERNEL2x4_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x4_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x4_SAVE + KERNEL2x4 + + MY_ALIGN +LSGEMM_L2x4_SAVE: + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,2 +#endif + MY_ALIGN +LSGEMM_L2x4_END: + andi. I, M, 2 + ble LSGEMM_L2x2_END + + MY_ALIGN +LSGEMM_L2x2_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x2 + ble LSGEMM_L2x2_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x2_LOOP: + + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,0 + KERNEL2x2_4 0,0, 4,0 + KERNEL2x2_4 0,0, 5,0 + KERNEL2x2_4 0,0, 6,0 + KERNEL2x2_4 0,0, 7,0 + KERNEL2x2_4 0,0, 8,0 + KERNEL2x2_4 0,0, 9,0 + KERNEL2x2_4 0,0, 10,0 + KERNEL2x2_4 0,0, 11,0 + KERNEL2x2_4 0,0, 12,0 + KERNEL2x2_4 0,0, 13,0 + KERNEL2x2_4 0,0, 14,0 + KERNEL2x2_4 0,0, 15,1 + + bdnz LSGEMM_L2x2_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x2_SAVE + MY_ALIGN +LSGEMM_L2x2_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x2_SUB2_16 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,0 + KERNEL2x2_4 0,0, 4,0 + KERNEL2x2_4 0,0, 5,0 + KERNEL2x2_4 0,0, 6,0 + KERNEL2x2_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_16: + andi. 
T10,L, 16 + ble LSGEMM_L2x2_SUB2_8 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,0 + KERNEL2x2_4 0,0, 2,0 + KERNEL2x2_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x2_SUB2_4 + KERNEL2x2_4 0,0, 0,0 + KERNEL2x2_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x2_SUB2_2 + KERNEL2x2_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x2_SUB2_1 + KERNEL2x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x2_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +LSGEMM_L2x2_SAVE: + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,2 +#endif + MY_ALIGN +LSGEMM_L2x2_END: + andi. I, M, 1 + ble LSGEMM_L2x1_END + + MY_ALIGN +LSGEMM_L2x1_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,2 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO2x1 + ble LSGEMM_L2x1_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_L2x1_LOOP: + + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,0 + KERNEL2x1_4 0,0, 4,0 + KERNEL2x1_4 0,0, 5,0 + KERNEL2x1_4 0,0, 6,0 + KERNEL2x1_4 0,0, 7,0 + KERNEL2x1_4 0,0, 8,0 + KERNEL2x1_4 0,0, 9,0 + KERNEL2x1_4 0,0, 10,0 + KERNEL2x1_4 0,0, 11,0 + KERNEL2x1_4 0,0, 12,0 + KERNEL2x1_4 0,0, 13,0 + KERNEL2x1_4 0,0, 14,0 + KERNEL2x1_4 0,0, 15,1 + + bdnz LSGEMM_L2x1_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_L2x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_L2x1_SAVE + MY_ALIGN +LSGEMM_L2x1_SUB2: + andi. T10,L, 32 + ble LSGEMM_L2x1_SUB2_16 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,0 + KERNEL2x1_4 0,0, 4,0 + KERNEL2x1_4 0,0, 5,0 + KERNEL2x1_4 0,0, 6,0 + KERNEL2x1_4 0,0, 7,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_L2x1_SUB2_8 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,0 + KERNEL2x1_4 0,0, 2,0 + KERNEL2x1_4 0,0, 3,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_L2x1_SUB2_4 + KERNEL2x1_4 0,0, 0,0 + KERNEL2x1_4 0,0, 1,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_L2x1_SUB2_2 + KERNEL2x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_L2x1_SUB2_1 + KERNEL2x1_2 0,0, 0,1 + MY_ALIGN +LSGEMM_L2x1_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +LSGEMM_L2x1_SAVE: + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,2 +#endif + MY_ALIGN +LSGEMM_L2x1_END: + slwi T1, K, 3 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif +LSGEMM_L2_END: + andi. T1, N, 1 + ble LSGEMM_END +LSGEMM_1_BEGIN: + + + mr AO, A + mr CO, C + add C, C, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 4 + ble LSGEMM_1x16_END + + MY_ALIGN +LSGEMM_1x16_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,16,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. 
L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x16 + ble LSGEMM_1x16_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_1x16_LOOP: + + KERNEL1x16_4 -2048,0, 0,0 + KERNEL1x16_4 -2048,0, 1,0 + KERNEL1x16_4 -2048,0, 2,0 + KERNEL1x16_4 -2048,0, 3,0 + KERNEL1x16_4 -2048,0, 4,0 + KERNEL1x16_4 -2048,0, 5,0 + KERNEL1x16_4 -2048,0, 6,0 + KERNEL1x16_4 -2048,0, 7,0 + KERNEL1x16_4 -2048,0, 8,0 + KERNEL1x16_4 -2048,0, 9,0 + KERNEL1x16_4 -2048,0, 10,0 + KERNEL1x16_4 -2048,0, 11,0 + KERNEL1x16_4 -2048,0, 12,0 + KERNEL1x16_4 -2048,0, 13,0 + KERNEL1x16_4 -2048,0, 14,0 + KERNEL1x16_4 -2048,0, 15,1 + + bdnz LSGEMM_1x16_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_1x16_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x16_SAVE + MY_ALIGN +LSGEMM_1x16_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x16_SUB2_16 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,0 + KERNEL1x16_4 0,0, 2,0 + KERNEL1x16_4 0,0, 3,0 + KERNEL1x16_4 0,0, 4,0 + KERNEL1x16_4 0,0, 5,0 + KERNEL1x16_4 0,0, 6,0 + KERNEL1x16_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x16_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x16_SUB2_8 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,0 + KERNEL1x16_4 0,0, 2,0 + KERNEL1x16_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x16_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x16_SUB2_4 + KERNEL1x16_4 0,0, 0,0 + KERNEL1x16_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x16_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x16_SUB2_2 + KERNEL1x16_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x16_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x16_SUB2_1 + KERNEL1x16_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x16_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x16_SAVE + KERNEL1x16 + + MY_ALIGN +LSGEMM_1x16_SAVE: + SAVE1x16 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,16,1 +#endif + addic. I, I, -1 + bgt+ LSGEMM_1x16_BEGIN + MY_ALIGN +LSGEMM_1x16_END: + andi. I, M, 8 + ble LSGEMM_1x8_END + + MY_ALIGN +LSGEMM_1x8_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,8,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x8 + ble LSGEMM_1x8_SUB0 + addi AO,AO,2048 + + mtctr L + + MY_ALIGN + +LSGEMM_1x8_LOOP: + + KERNEL1x8_4 -2048,0, 0,0 + KERNEL1x8_4 -2048,0, 1,0 + KERNEL1x8_4 -2048,0, 2,0 + KERNEL1x8_4 -2048,0, 3,0 + KERNEL1x8_4 -2048,0, 4,0 + KERNEL1x8_4 -2048,0, 5,0 + KERNEL1x8_4 -2048,0, 6,0 + KERNEL1x8_4 -2048,0, 7,0 + KERNEL1x8_4 -2048,0, 8,0 + KERNEL1x8_4 -2048,0, 9,0 + KERNEL1x8_4 -2048,0, 10,0 + KERNEL1x8_4 -2048,0, 11,0 + KERNEL1x8_4 -2048,0, 12,0 + KERNEL1x8_4 -2048,0, 13,0 + KERNEL1x8_4 -2048,0, 14,0 + KERNEL1x8_4 -2048,0, 15,1 + + bdnz LSGEMM_1x8_LOOP + MY_ALIGN + addi AO,AO, -2048 + MY_ALIGN +LSGEMM_1x8_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x8_SAVE + MY_ALIGN +LSGEMM_1x8_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x8_SUB2_16 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,0 + KERNEL1x8_4 0,0, 2,0 + KERNEL1x8_4 0,0, 3,0 + KERNEL1x8_4 0,0, 4,0 + KERNEL1x8_4 0,0, 5,0 + KERNEL1x8_4 0,0, 6,0 + KERNEL1x8_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x8_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x8_SUB2_8 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,0 + KERNEL1x8_4 0,0, 2,0 + KERNEL1x8_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x8_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x8_SUB2_4 + KERNEL1x8_4 0,0, 0,0 + KERNEL1x8_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x8_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x8_SUB2_2 + KERNEL1x8_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x8_SUB2_2: + andi. 
T10,L, 2 + ble LSGEMM_1x8_SUB2_1 + KERNEL1x8_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x8_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x8_SAVE + KERNEL1x8 + + MY_ALIGN +LSGEMM_1x8_SAVE: + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,8,1 +#endif + MY_ALIGN +LSGEMM_1x8_END: + andi. I, M, 4 + ble LSGEMM_1x4_END + + MY_ALIGN +LSGEMM_1x4_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,4,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x4 + ble LSGEMM_1x4_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x4_LOOP: + + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,0 + KERNEL1x4_4 0,0, 4,0 + KERNEL1x4_4 0,0, 5,0 + KERNEL1x4_4 0,0, 6,0 + KERNEL1x4_4 0,0, 7,0 + KERNEL1x4_4 0,0, 8,0 + KERNEL1x4_4 0,0, 9,0 + KERNEL1x4_4 0,0, 10,0 + KERNEL1x4_4 0,0, 11,0 + KERNEL1x4_4 0,0, 12,0 + KERNEL1x4_4 0,0, 13,0 + KERNEL1x4_4 0,0, 14,0 + KERNEL1x4_4 0,0, 15,1 + + bdnz LSGEMM_1x4_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x4_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x4_SAVE + MY_ALIGN +LSGEMM_1x4_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x4_SUB2_16 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,0 + KERNEL1x4_4 0,0, 4,0 + KERNEL1x4_4 0,0, 5,0 + KERNEL1x4_4 0,0, 6,0 + KERNEL1x4_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x4_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x4_SUB2_8 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,0 + KERNEL1x4_4 0,0, 2,0 + KERNEL1x4_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x4_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x4_SUB2_4 + KERNEL1x4_4 0,0, 0,0 + KERNEL1x4_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x4_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x4_SUB2_2 + KERNEL1x4_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x4_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x4_SUB2_1 + KERNEL1x4_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x4_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x4_SAVE + KERNEL1x4 + + MY_ALIGN +LSGEMM_1x4_SAVE: + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,4,1 +#endif + MY_ALIGN +LSGEMM_1x4_END: + andi. I, M, 2 + ble LSGEMM_1x2_END + + MY_ALIGN +LSGEMM_1x2_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,2,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x2 + ble LSGEMM_1x2_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x2_LOOP: + + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,0 + KERNEL1x2_4 0,0, 4,0 + KERNEL1x2_4 0,0, 5,0 + KERNEL1x2_4 0,0, 6,0 + KERNEL1x2_4 0,0, 7,0 + KERNEL1x2_4 0,0, 8,0 + KERNEL1x2_4 0,0, 9,0 + KERNEL1x2_4 0,0, 10,0 + KERNEL1x2_4 0,0, 11,0 + KERNEL1x2_4 0,0, 12,0 + KERNEL1x2_4 0,0, 13,0 + KERNEL1x2_4 0,0, 14,0 + KERNEL1x2_4 0,0, 15,1 + + bdnz LSGEMM_1x2_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x2_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x2_SAVE + MY_ALIGN +LSGEMM_1x2_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x2_SUB2_16 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,0 + KERNEL1x2_4 0,0, 4,0 + KERNEL1x2_4 0,0, 5,0 + KERNEL1x2_4 0,0, 6,0 + KERNEL1x2_4 0,0, 7,1 + MY_ALIGN +LSGEMM_1x2_SUB2_16: + andi. 
T10,L, 16 + ble LSGEMM_1x2_SUB2_8 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,0 + KERNEL1x2_4 0,0, 2,0 + KERNEL1x2_4 0,0, 3,1 + MY_ALIGN +LSGEMM_1x2_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x2_SUB2_4 + KERNEL1x2_4 0,0, 0,0 + KERNEL1x2_4 0,0, 1,1 + MY_ALIGN +LSGEMM_1x2_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x2_SUB2_2 + KERNEL1x2_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x2_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x2_SUB2_1 + KERNEL1x2_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x2_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x2_SAVE + KERNEL1x2 + + MY_ALIGN +LSGEMM_1x2_SAVE: + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,2,1 +#endif + MY_ALIGN +LSGEMM_1x2_END: + andi. I, M, 1 + ble LSGEMM_1x1_END + + MY_ALIGN +LSGEMM_1x1_BEGIN: + +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif + +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T11,K,TEMP_REG,1,1 + srawi. L, T11, 6 /**(T11 ) % 64x */ +#else + srawi. L, K, 6 /**(K ) % 64x */ +#endif + + ZERO1x1 + ble LSGEMM_1x1_SUB0 + + + mtctr L + + MY_ALIGN + +LSGEMM_1x1_LOOP: + + KERNEL1x1_16 0,0, 0,0 + KERNEL1x1_16 0,0, 1,0 + KERNEL1x1_16 0,0, 2,0 + KERNEL1x1_16 0,0, 3,1 + + bdnz LSGEMM_1x1_LOOP + MY_ALIGN + + MY_ALIGN +LSGEMM_1x1_SUB0: +#if defined(TRMMKERNEL) + andi. L, T11, 63 +#else + andi. L, K, 63 +#endif + ble LSGEMM_1x1_SAVE + MY_ALIGN +LSGEMM_1x1_SUB2: + andi. T10,L, 32 + ble LSGEMM_1x1_SUB2_16 + KERNEL1x1_16 0,0, 0,0 + KERNEL1x1_16 0,0, 1,1 + MY_ALIGN +LSGEMM_1x1_SUB2_16: + andi. T10,L, 16 + ble LSGEMM_1x1_SUB2_8 + KERNEL1x1_16 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_8: + andi. T10,L, 8 + ble LSGEMM_1x1_SUB2_4 + KERNEL1x1_8 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_4: + andi. T10,L, 4 + ble LSGEMM_1x1_SUB2_2 + KERNEL1x1_4 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_2: + andi. T10,L, 2 + ble LSGEMM_1x1_SUB2_1 + KERNEL1x1_2 0,0, 0,1 + MY_ALIGN +LSGEMM_1x1_SUB2_1: + andi. T10,L, 1 + ble LSGEMM_1x1_SAVE + KERNEL1x1 + + MY_ALIGN +LSGEMM_1x1_SAVE: + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T11,K,TEMP_REG,BO,AO,1,1 +#endif + MY_ALIGN +LSGEMM_1x1_END: + slwi T1, K, 2 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif LSGEMM_END: \ No newline at end of file diff --git a/kernel/power/sgemm_macros_power9.S b/kernel/power/sgemm_macros_power9.S index 2c9e537c7..3750d338d 100644 --- a/kernel/power/sgemm_macros_power9.S +++ b/kernel/power/sgemm_macros_power9.S @@ -1,5575 +1,5575 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#define unit_size 4 -#define DISP64(ind,disp) (ind*unit_size*64+disp) -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) - -/********************************************************************************************** -* Macros for N=8 and M=16 -**********************************************************************************************/ - - - -.macro KERNEL8x16_L1_L4 Index,IsLast - KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL8x16_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro Zero8X16 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endm - -.macro LOAD8x16 OffsetA,OffsetB - - lxv vs24, (\OffsetB+0)(BO) - lxv vs28, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endm - -.macro END8x16_NORMAL - END8x16 0, AO, BO, 64,32 -.endm - -.macro END8x16_WITHOUT_ADD - END8x16 0, AO,BO,0,0 -.endm - -.macro END8x16 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, 
vs0,vs24 - xvmulsp vs33, vs1,vs24 - xvmulsp vs34, vs2,vs24 - xvmulsp vs35, vs3,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - xvmulsp vs38, vs2,vs25 - xvmulsp vs39, vs3,vs25 - - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - xvmulsp vs42, vs2,vs26 - xvmulsp vs43, vs3,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - xvmulsp vs46, vs2,vs27 - xvmulsp vs47, vs3,vs27 - - xvmulsp vs48, vs0,vs28 - xvmulsp vs49, vs1,vs28 - xvmulsp vs50, vs2,vs28 - xvmulsp vs51, vs3,vs28 - - xvmulsp vs52, vs0,vs29 - xvmulsp vs53, vs1,vs29 - xvmulsp vs54, vs2,vs29 - xvmulsp vs55, vs3,vs29 - - xvmulsp vs56, vs0,vs30 - xvmulsp vs57, vs1,vs30 - xvmulsp vs58, vs2,vs30 - xvmulsp vs59, vs3,vs30 - - xvmulsp vs60, vs0,vs31 - xvmulsp vs61, vs1,vs31 - xvmulsp vs62, vs2,vs31 - xvmulsp vs63, vs3,vs31 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 - -.endif -.endm - -.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - -KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0 -KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete - -.endm - -.macro KERNEL8x16 First - - LOAD8x16 0,0 - END8x16 \First, AO, BO, 64,32 -.endm - -.macro LOAD8x16_2 - LOAD8x16_2O AO,BO, 0,0 -.endm - -.macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB - lxv vs8, (\OffsetB)(\BREG) - lxv vs12, (16+\OffsetB)(\BREG) - lxv vs24, (32+\OffsetB)(\BREG) - lxv vs28, (32+16+\OffsetB)(\BREG) - lxv vs4, (0+\OffsetA)(\AREG) - lxv vs5, (16+\OffsetA)(\AREG) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - lxv vs6, (32+\OffsetA)(\AREG) - lxv vs7, (48+\OffsetA)(\AREG) - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - lxv vs0, (64+\OffsetA)(\AREG) - lxv vs1, (64+16+\OffsetA)(\AREG) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - lxv vs2, (64+32+\OffsetA)(\AREG) - lxv vs3, (64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - -.macro END8x16_2 - /*for load2 offset will be 128 and 64*/ - KERNEL8x16_2 AO,BO, 128,64,0 ,1,1 -.endm - - - -.macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast - KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - - xvmaddasp vs36, 
vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - -.if \Complete==0 - lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, vs7,vs12 -.if \Complete==0 - lxv vs8, DISP16(\Index,\OffsetB)(\BREG) - lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask -.endif - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 -.if \Complete==0 - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.endif - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - xvmaddasp vs62, vs6,vs15 - xvmaddasp vs63, vs7,vs15 -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 -.endif - -.if \Complete==0 - lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.if \Complete==0 - lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 -.if \Complete==0 - lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endif -.if \Complete==0 - lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) -.endif - - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP16(\Index,\OffsetB) - addi \AREG, \AREG, DISP32(\Index,\OffsetA) - -.else - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP32(\Index,128) - -.endif -.endif - - -.endm - - -.macro SAVE8x16 - - slwi T10, LDC , 1 - add T1, CO, LDC - - add T2, CO, T10 - add T3, T1, T10 - - add T4, T2, T10 - add T5, T3, T10 - - add T6, T4, T10 - add T7, T5, T10 - - - - /* permute to restore butterfly rank 1 updateto normal promoted one */ - /* permute 16 vs8 MEM(CO) vs9 MEM(CO+LDC) vs10 MEM(CO+2*LDC) vs11 MEM(CO+3*LDC) */ - /* permute 16 vs12 MEM(16+CO) vs13 MEM(16+CO+LDC) vs14 MEM(16+CO+2*LDC) vs15 MEM(16+CO+3*LDC) */ - /* permute 16 vs16 MEM(32+CO) vs17 MEM(32+CO+LDC) vs18 MEM(32+CO+2*LDC) vs19 MEM(32+CO+3*LDC) */ - /* permute 16 vs24 MEM(32+CO) vs25 
MEM(32+CO+LDC) vs26 MEM(32+CO+2*LDC) vs27 MEM(32+CO+3*LDC) */ - - xxmrglw vs8, vs32, vs44 - xxmrglw vs10, vs36, vs40 - - xxmrghw vs1, vs32, vs44 - xxmrghw vs0, vs36, vs40 - - xxmrglw vs12, vs33, vs45 - xxmrglw vs14, vs37, vs41 - - xxmrghw vs2, vs37, vs41 - xxmrghw vs3, vs33, vs45 -#ifndef TRMMKERNEL - lxv vs32, 0(CO) - lxv vs33, 16(CO) -#endif - xxmrglw vs16, vs34, vs46 - xxmrglw vs18, vs38, vs42 - - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - - xxmrghw vs4, vs38, vs42 - xxmrghw vs5, vs34, vs46 - - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxmrglw vs24, vs35, vs47 - xxmrglw vs26, vs39, vs43 - - xxlor vs17, vs16, vs16 - xxlor vs19, vs18, vs18 - - xxmrghw vs30, vs39, vs43 - xxmrghw vs31, vs35, vs47 -#ifndef TRMMKERNEL - lxv vs34, 32(CO) - lxv vs35, 48(CO) -#endif - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 -#ifndef TRMMKERNEL - lxv vs36, 0(T1) - lxv vs37, 16(T1) -#endif - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - -#ifndef TRMMKERNEL - lxv vs38, 32(T1) - lxv vs39, 48(T1) -#endif - - xxlor vs25, vs24, vs24 - xxlor vs27, vs26, vs26 - - - -#ifndef TRMMKERNEL - lxv vs40, 0(T2) - lxv vs41, 16(T2) -#endif - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 -#ifndef TRMMKERNEL - lxv vs42, 32(T2) - lxv vs43, 48(T2) -#endif - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 -#ifndef TRMMKERNEL - lxv vs44, 0(T3) - lxv vs45, 16(T3) -#endif - xxperm vs16, vs4, save_permute_1 - xxperm vs18, vs5, save_permute_1 -#ifndef TRMMKERNEL - lxv vs46, 32(T3) - lxv vs47, 48(T3) -#endif - - - - - - xxperm vs17, vs4, save_permute_2 - xxperm vs19, vs5, save_permute_2 -#ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r -#endif - xxperm vs24, vs30, save_permute_1 - xxperm vs26, vs31, save_permute_1 - - - stxv vs32, 0(CO) - stxv vs33, 16(CO) -#ifdef TRMMKERNEL - xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r -#else - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r -#endif - - xxperm vs25, vs30, save_permute_2 - xxperm vs27, vs31, save_permute_2 - - - stxv vs34, 32(CO) - stxv vs35, 48(CO) -#ifdef TRMMKERNEL - xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r -#else - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r -#endif - stxv vs36, 0(T1) - stxv vs37, 16(T1) -#ifdef TRMMKERNEL - xvmulsp vs38, vs17, alpha_r - xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs38, vs17, alpha_r - xvmaddasp vs39, vs25, alpha_r -#endif - stxv vs38, 32(T1) - stxv vs39, 48(T1) - -#ifdef TRMMKERNEL - xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r -#else - xvmaddasp vs40, vs10, alpha_r - xvmaddasp vs41, vs14, alpha_r -#endif - - stxv vs40, 0(T2) - stxv vs41, 16(T2) -#ifdef TRMMKERNEL - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r -#else - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r -#endif - stxv vs42, 32(T2) - stxv vs43, 48(T2) -#ifdef TRMMKERNEL - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r -#else - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r -#endif - stxv vs44, 0(T3) - stxv vs45, 16(T3) -#ifdef TRMMKERNEL - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r -#endif - stxv vs46, 32(T3) - stxv vs47, 48(T3) - - /*****the same with the second 8X8 ****/ - #ifndef TRMMKERNEL - lxv vs32, 0(T4) - lxv vs33, 16(T4) -#endif - xxmrglw vs8, vs48, vs60 - xxmrglw 
vs10, vs52, vs56 -#ifndef TRMMKERNEL - lxv vs34, 32(T4) - lxv vs35, 48(T4) -#endif - xxmrghw vs1, vs48, vs60 - xxmrghw vs0, vs52, vs56 -#ifndef TRMMKERNEL - lxv vs36, 0(T5) - lxv vs37, 16(T5) -#endif - xxmrglw vs12, vs49, vs61 - xxmrglw vs14, vs53, vs57 -#ifndef TRMMKERNEL - lxv vs38,32(T5) - lxv vs39, 48(T5) -#endif - - xxmrghw vs2, vs53, vs57 - xxmrghw vs3, vs49, vs61 -#ifndef TRMMKERNEL - lxv vs40, 0(T6) - lxv vs41, 16(T6) -#endif - xxmrglw vs16, vs50, vs62 - xxmrglw vs18, vs54, vs58 -#ifndef TRMMKERNEL - lxv vs42, 32(T6) - lxv vs43, 48(T6) -#endif - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - xxmrghw vs4, vs54, vs58 - xxmrghw vs5, vs50, vs62 -#ifndef TRMMKERNEL - lxv vs44, 0(T7) - lxv vs45, 16(T7) -#endif - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxmrglw vs24, vs51, vs63 - xxmrglw vs26, vs55, vs59 -#ifndef TRMMKERNEL - lxv vs46, 32(T7) - lxv vs47, 48(T7) -#endif - xxlor vs17, vs16, vs16 - xxlor vs19, vs18, vs18 - xxmrghw vs30, vs55, vs59 - xxmrghw vs31, vs51, vs63 - - - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - - xxlor vs25, vs24, vs24 - xxlor vs27, vs26, vs26 - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - #ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r -#endif - xxperm vs16, vs4, save_permute_1 - xxperm vs18, vs5, save_permute_1 - stxv vs32, 0(T4) - stxv vs33, 16(T4) - xxperm vs17, vs4, save_permute_2 - xxperm vs19, vs5, save_permute_2 - xxperm vs24, vs30, save_permute_1 - xxperm vs26, vs31, save_permute_1 - xxperm vs25, vs30, save_permute_2 - xxperm vs27, vs31, save_permute_2 - -#ifdef TRMMKERNEL - xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r -#else - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r -#endif - stxv vs34, 32(T4) - stxv vs35, 48(T4) - -#ifdef TRMMKERNEL - xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r -#else - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r -#endif - stxv vs36, 0(T5) - stxv vs37, 16(T5) - -#ifdef TRMMKERNEL - xvmulsp vs38, vs17, alpha_r - xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs38, vs17, alpha_r - xvmaddasp vs39, vs25, alpha_r -#endif - - - - - stxv vs38, 32(T5) - stxv vs39, 48(T5) - - -#ifdef TRMMKERNEL - xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r -#else - xvmaddasp vs40, vs10, alpha_r - xvmaddasp vs41, vs14, alpha_r -#endif - stxv vs40, 0(T6) - stxv vs41, 16(T6) -#ifdef TRMMKERNEL - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r -#else - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r -#endif - stxv vs42, 32(T6) - stxv vs43, 48(T6) -#ifdef TRMMKERNEL - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r -#else - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r -#endif - - stxv vs44, 0(T7) - stxv vs45, 16(T7) -#ifdef TRMMKERNEL - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r -#endif - - stxv vs46, 32(T7) - stxv vs47, 48(T7) - - - addi CO,CO,64 - - -.endm - - - -/********************************************************************************************** -* Macros for N=8 and M=8 -**********************************************************************************************/ - -.macro LOAD8x8_1 - LOAD8x8 1 -.endm - -.macro LOAD8x8_0 - LOAD8x8 0 
-.endm - -.macro KERNEL8x8_L1_L4 Index,IsLast - KERNEL8x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL8x8_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL8x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL8x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro END8x8_NORMAL - END8x8 0, AO, BO, 32,32 -.endm - -.macro Zero8X8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - -.endm - -.macro LOAD8x8 Zero - - lxv vs24, 0(BO) - lxv vs28, 16(BO) - lxv vs0, 0(AO) - lxv vs1, 16(AO) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 -.endif -.endm - - -.macro END8x8 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - - xvmulsp vs48, vs0,vs28 - xvmulsp vs49, vs1,vs28 - - xvmulsp vs52, vs0,vs29 - xvmulsp vs53, vs1,vs29 - - xvmulsp vs56, vs0,vs30 - xvmulsp vs57, vs1,vs30 - - xvmulsp vs60, vs0,vs31 - xvmulsp vs61, vs1,vs31 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - -.endif -.endm - -.macro KERNEL8x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) - lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) - - lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - - 
xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - - lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) - - - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) - lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - - - lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) - - - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 -.if \Complete==0 - lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - - -.if \Complete==0 - lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - -.endif -.if \IsLast==1 -.if \Complete==1 - - addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) - addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) -.else - - addi \BREG, \BREG, DISP32(\Index,128) - addi \AREG, \AREG, DISP32(\Index,128) -.endif -.endif - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endif - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - -.endm - -.macro KERNEL8x8 First - - LOAD8x8 0 - END8x8 \First, AO, BO, 32,32 -.endm - -.macro KERNEL8x8_L1_L2_I 
AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) - lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) - - lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - -.endif - - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - -.if \First==1 - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - - xvmulsp vs48, vs0,vs28 - xvmulsp vs49, vs1,vs28 - - xvmulsp vs52, vs0,vs29 - xvmulsp vs53, vs1,vs29 - - xvmulsp vs56, vs0,vs30 - xvmulsp vs57, vs1,vs30 - - xvmulsp vs60, vs0,vs31 - xvmulsp vs61, vs1,vs31 - -.else - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - -.endif -.if \Complete==0 - lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) - - lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) - addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) - -.else - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP16(\Index,64) -.endif -.endif - -.if \First==1 - xvmulsp vs32, vs4,vs8 - xvmulsp vs33, vs5,vs8 - - xvmulsp vs36, vs4,vs9 - xvmulsp vs37, vs5,vs9 - -.else - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 - -.endif -.if \First==1 - xvmulsp vs40, vs4,vs10 - xvmulsp vs41, vs5,vs10 - - xvmulsp vs44, vs4,vs11 - xvmulsp vs45, vs5,vs11 - - xvmulsp vs48, vs4,vs12 - xvmulsp vs49, vs5,vs12 - - xvmulsp vs52, vs4,vs13 - xvmulsp vs53, vs5,vs13 - - xvmulsp vs56, vs4,vs14 - xvmulsp vs57, vs5,vs14 - - xvmulsp vs60, vs4,vs15 - xvmulsp vs61, vs5,vs15 - -.else - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 - -.endif - -.endm - - -.macro SAVE8x8 - - slwi T10, LDC , 1 - add T1, CO, LDC - - add T2, CO, T10 - add T3, T1, T10 - - add T4, T2, T10 - add T5, T3, T10 - - add T6, T4, T10 - add T7, T5, T10 - -#ifndef TRMMKERNEL - lxv vs34, 0(CO) - lxv vs35, 16(CO) - lxv vs38, 0(T1) - lxv vs39, 16(T1) - lxv vs42, 0(T2) - lxv vs43, 16(T2) - lxv vs46, 0(T3) - lxv vs47, 16(T3) - - lxv vs50, 0(T4) - lxv vs51, 16(T4) - lxv vs54, 0(T5) - lxv vs55, 16(T5) - lxv vs58, 0(T6) - lxv vs59, 16(T6) - lxv vs62, 0(T7) - lxv vs63, 16(T7) -#endif - - xxmrglw vs8, vs32, vs44 - xxmrglw vs10, vs36, vs40 - - xxmrghw vs1, vs32, vs44 - xxmrghw vs0, 
vs36, vs40 - - xxmrglw vs12, vs33, vs45 - xxmrglw vs14, vs37, vs41 - - xxmrghw vs2, vs37, vs41 - xxmrghw vs3, vs33, vs45 - - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - - - /* multiply add normal way */ - -#ifdef TRMMKERNEL - xvmulsp vs34, vs8, alpha_r - xvmulsp vs35, vs12, alpha_r - xvmulsp vs38, vs9, alpha_r - xvmulsp vs39, vs13, alpha_r - xvmulsp vs42, vs10, alpha_r - xvmulsp vs43, vs14, alpha_r - xvmulsp vs46, vs11, alpha_r - xvmulsp vs47, vs15, alpha_r -#else - xvmaddasp vs34, vs8, alpha_r - xvmaddasp vs35, vs12, alpha_r - xvmaddasp vs38, vs9, alpha_r - xvmaddasp vs39, vs13, alpha_r - xvmaddasp vs42, vs10, alpha_r - xvmaddasp vs43, vs14, alpha_r - xvmaddasp vs46, vs11, alpha_r - xvmaddasp vs47, vs15, alpha_r -#endif - - - xxmrglw vs8, vs48, vs60 - xxmrglw vs10, vs52, vs56 - - xxmrghw vs1, vs48, vs60 - xxmrghw vs0, vs52, vs56 - stxv vs34, 0(CO) - stxv vs35, 16(CO) - xxmrglw vs12, vs49, vs61 - xxmrglw vs14, vs53, vs57 - stxv vs38, 0(T1) - stxv vs39, 16(T1) - xxmrghw vs2, vs53, vs57 - xxmrghw vs3, vs49, vs61 - stxv vs42, 0(T2) - stxv vs43, 16(T2) - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - stxv vs46, 0(T3) - stxv vs47, 16(T3) - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - - - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - - #ifdef TRMMKERNEL - xvmulsp vs50, vs8, alpha_r - xvmulsp vs51, vs12, alpha_r - xvmulsp vs54, vs9, alpha_r - xvmulsp vs55, vs13, alpha_r - xvmulsp vs58, vs10, alpha_r - xvmulsp vs59, vs14, alpha_r - xvmulsp vs62, vs11, alpha_r - xvmulsp vs63, vs15, alpha_r -#else - xvmaddasp vs50, vs8, alpha_r - xvmaddasp vs51, vs12, alpha_r - xvmaddasp vs54, vs9, alpha_r - xvmaddasp vs55, vs13, alpha_r - xvmaddasp vs58, vs10, alpha_r - xvmaddasp vs59, vs14, alpha_r - xvmaddasp vs62, vs11, alpha_r - xvmaddasp vs63, vs15, alpha_r -#endif - - stxv vs50, 0(T4) - stxv vs51, 16(T4) - stxv vs54, 0(T5) - stxv vs55, 16(T5) - stxv vs58, 0(T6) - stxv vs59, 16(T6) - stxv vs62, 0(T7) - stxv vs63, 16(T7) - - addi CO,CO,32 - -.endm - - -/********************************************************************************************** -* Macros for N=8 and M=4 -**********************************************************************************************/ - -.macro LOAD8x4_1 - LOAD8x4 1 -.endm - -.macro LOAD8x4_0 - LOAD8x4 0 -.endm - -.macro KERNEL8x4_L1_L4 Index,IsLast - KERNEL8x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL8x4_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL8x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL8x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I \AREG,\BREG, 
\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL8x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro Zero8X4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - -.endm - -.macro LOAD8x4 Zero - - lxv vs0, 0(AO) - lxv vs24, 0(BO) - lxv vs25, 16(BO) - - - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 -.endif -.endm - -.macro END8x4_NORMAL - END8x4 0, AO, BO, 16,32 -.endm - -.macro END8x4 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs24, vs0 - xvmulsp vs33, vs24, vs1 - xvmulsp vs34, vs24, vs2 - xvmulsp vs35, vs24, vs3 - - xvmulsp vs48, vs25, vs0 - xvmulsp vs49, vs25, vs1 - xvmulsp vs50, vs25, vs2 - xvmulsp vs51, vs25, vs3 -.else - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - xvmaddasp vs48, vs25, vs0 - xvmaddasp vs49, vs25, vs1 - xvmaddasp vs50, vs25, vs2 - xvmaddasp vs51, vs25, vs3 - -.endif -.endm - -.macro KERNEL8x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 - - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - xvmaddasp vs48, vs25, vs0 - xvmaddasp vs49, vs25, vs1 - xvmaddasp vs50, vs25, vs2 - xvmaddasp vs51, vs25, vs3 - - lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) - lxv vs24, DISP32(\Index, 32+\OffsetB)(\BREG) - lxv vs25, DISP32(\Index, 48+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 - - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - xvmaddasp vs48, vs27, vs4 - xvmaddasp vs49, vs27, vs5 - xvmaddasp vs50, vs27, vs6 - xvmaddasp vs51, vs27, vs7 - - - lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) - lxv vs26, DISP32(\Index, 64+\OffsetB)(\BREG) - lxv vs27, DISP32(\Index, 80+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 - - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - xvmaddasp vs48, vs25, vs0 - xvmaddasp vs49, vs25, vs1 - xvmaddasp vs50, vs25, vs2 - xvmaddasp vs51, vs25, vs3 - -.if \Complete==0 - - lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) - lxv vs24, DISP32(\Index, 96+\OffsetB)(\BREG) - lxv vs25, DISP32(\Index, 96+16+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 -.endif - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - xvmaddasp vs48, vs27, vs4 - xvmaddasp vs49, vs27, vs5 - xvmaddasp vs50, vs27, vs6 - xvmaddasp vs51, vs27, vs7 - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, 
DISP16(\Index,16*3+\OffsetA) - addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) - -.else - addi \AREG, \AREG, DISP16(\Index,64) - addi \BREG, \BREG, DISP32(\Index,128) - -.endif -.endif - - -.endm - -.macro KERNEL8x4 First - LOAD8x4 0 - END8x4 \First, AO, BO, 16,32 -.endm - -.macro KERNEL8x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 -.if \First==1 - xvmulsp vs32, vs24, vs0 - xvmulsp vs33, vs24, vs1 - xvmulsp vs34, vs24, vs2 - xvmulsp vs35, vs24, vs3 - - xvmulsp vs48, vs25, vs0 - xvmulsp vs49, vs25, vs1 - xvmulsp vs50, vs25, vs2 - xvmulsp vs51, vs25, vs3 -.else - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - xvmaddasp vs48, vs25, vs0 - xvmaddasp vs49, vs25, vs1 - xvmaddasp vs50, vs25, vs2 - xvmaddasp vs51, vs25, vs3 -.endif - -.if \Complete==0 - - lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) - lxv vs24, DISP16(\Index, 32+\OffsetB)(\BREG) - lxv vs25, DISP16(\Index, 48+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 -.endif - -.if \First==1 - xvmulsp vs32, vs26, vs4 - xvmulsp vs33, vs26, vs5 - xvmulsp vs34, vs26, vs6 - xvmulsp vs35, vs26, vs7 - - xvmulsp vs48, vs27, vs4 - xvmulsp vs49, vs27, vs5 - xvmulsp vs50, vs27, vs6 - xvmulsp vs51, vs27, vs7 - - -.else - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - xvmaddasp vs48, vs27, vs4 - xvmaddasp vs49, vs27, vs5 - xvmaddasp vs50, vs27, vs6 - xvmaddasp vs51, vs27, vs7 -.endif - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) - -.else - addi \AREG, \AREG, DISP8(\Index,32) - addi \BREG, \BREG, DISP16(\Index,64) - -.endif -.endif - - -.endm - - -.macro SAVE8x4 - slwi T10, LDC , 1 - add T1, CO, LDC -#if !defined(TRMMKERNEL) - lxv vs36, 0(CO) - lxv vs37, 0(T1) -#endif - add T2, CO, T10 - add T3, T1, T10 -#if !defined(TRMMKERNEL) - lxv vs38, 0(T2) - lxv vs39, 0(T3) -#endif - add T4, T2, T10 - add T5, T3, T10 -#if !defined(TRMMKERNEL) - lxv vs40, 0(T4) - lxv vs41, 0(T5) -#endif - add T6, T4, T10 - add T7, T5, T10 -#if !defined(TRMMKERNEL) - lxv vs42, 0(T6) - lxv vs43, 0(T7) -#endif - xxmrglw vs0, vs35,vs32 - xxmrglw vs1, vs34,vs33 - xxmrglw vs4, vs32,vs35 - xxmrglw vs5, vs33,vs34 - - - xxmrghw vs2, vs35,vs32 - xxmrghw vs3, vs34,vs33 - xxmrghw vs6, vs32,vs35 - xxmrghw vs7, vs33,vs34 - - xxmrgld vs24, vs1, vs0 - xxmrghd vs25,vs5,vs4 - - xxmrgld vs26, vs2, vs3 - xxmrghd vs27,vs6,vs7 - - - xxmrglw vs0, vs51,vs48 - xxmrglw vs1, vs50,vs49 - xxmrglw vs4, vs48,vs51 - xxmrglw vs5, vs49,vs50 - - xxmrghw vs2, vs51,vs48 - xxmrghw vs3, vs50,vs49 - xxmrghw vs6, vs48,vs51 - xxmrghw vs7, vs49,vs50 - - xxmrgld vs28, vs1, vs0 - xxmrghd vs29,vs5,vs4 - - xxmrgld vs30, vs2, vs3 - xxmrghd vs31,vs6,vs7 -#if defined(TRMMKERNEL) - - xvmulsp vs36, vs24, alpha_r - xvmulsp vs37, vs25, alpha_r - xvmulsp vs38, vs26, alpha_r - xvmulsp vs39, vs27, alpha_r - xvmulsp vs40, vs28, alpha_r - xvmulsp vs41, vs29, alpha_r - xvmulsp vs42, vs30, alpha_r - xvmulsp vs43, vs31, alpha_r -#else - xvmaddasp vs36, vs24, alpha_r - xvmaddasp vs37, vs25, alpha_r - xvmaddasp vs38, vs26, alpha_r - xvmaddasp vs39, vs27, alpha_r - xvmaddasp vs40, vs28, alpha_r - xvmaddasp vs41, vs29, alpha_r - 
xvmaddasp vs42, vs30, alpha_r - xvmaddasp vs43, vs31, alpha_r -#endif - - stxv vs36, 0(CO) - stxv vs37, 0(T1) - stxv vs38, 0(T2) - stxv vs39, 0(T3) - stxv vs40, 0(T4) - stxv vs41, 0(T5) - stxv vs42, 0(T6) - stxv vs43, 0(T7) - - - addi CO,CO,16 -.endm - - -/********************************************************************************************** -* Macros for N=8 and M=2 -**********************************************************************************************/ - - -.macro KERNEL8x2_2 OffsetA,OffsetB, Index,IsLast - KERNEL8x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - - -.macro Zero8x2 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 - -.endm - -.macro KERNEL8x2 - KERNEL8x2_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL8x2_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP8(\Index,16+\OffsetB)(\BREG) - xxspltw vs8, vs36, 0 - xxspltw vs9, vs36, 1 - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs2, vs26, vs9 - xvmulsp vs3, vs27, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs26, vs9 - xvmaddasp vs3, vs27, vs9 - - .endif - - addi \AREG, \AREG, DISP2(\Index,8) - addi \BREG, \BREG, DISP8(\Index,32) - -.endm - -.macro KERNEL8x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast - - lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs29, DISP16(\Index,48+\OffsetB)(\BREG) - xxspltw vs8, vs4, 2 - xxspltw vs9, vs4, 3 - xxspltw vs10, vs4, 0 - xxspltw vs11, vs4, 1 - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs2, vs26, vs9 - xvmulsp vs3, vs27, vs9 - - xvmulsp vs0, vs28, vs10 - xvmulsp vs1, vs29, vs10 - xvmulsp vs2, vs28, vs11 - xvmulsp vs3, vs29, vs11 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs26, vs9 - xvmaddasp vs3, vs27, vs9 - - xvmaddasp vs0, vs28, vs10 - xvmaddasp vs1, vs29, vs10 - xvmaddasp vs2, vs28, vs11 - xvmaddasp vs3, vs29, vs11 - .endif - - -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP16(\Index,64) -.endif - -.endm - - -.macro SAVE8x2 - slwi T10, LDC , 1 - add T1, CO, LDC - add T2, CO, T10 - add T3, T1, T10 - add T4, T2, T10 - add T5, T3, T10 - add T6, T4, T10 - add T7, T5, T10 - /*convert alpha_r for multiply*/ - xscvspdp vs4,alpha_r -/* v0 corresponds to vs32, do not forget*/ -#if !defined(TRMMKERNEL) - lxssp v0,0(CO) - lxssp v1,4(CO) - - lxssp v2,0(T1) - lxssp v3,4(T1) - - lxssp v4,0(T2) - lxssp v5,4(T2) - - lxssp v6,0(T3) - lxssp v7,4(T3) - - lxssp v8,0(T4) - lxssp v9,4(T4) - - lxssp v10,0(T5) - lxssp v11,4(T5) - - lxssp v12,0(T6) - lxssp v13,4(T6) - - lxssp v14,0(T7) - lxssp v15,4(T7) -#endif - xscvspdp vs5, vs2 - xxspltw vs6, vs2, 1 - xxspltw vs7, vs2, 2 - xxspltw vs8, vs2, 3 - xscvspdp vs6,vs6 - xscvspdp vs7,vs7 - xscvspdp vs8,vs8 - - xscvspdp vs24, vs0 - xxspltw vs25, vs0, 1 - xxspltw vs26, vs0, 2 - xxspltw vs27, vs0, 3 - xscvspdp vs25,vs25 - xscvspdp vs26,vs26 - xscvspdp vs27,vs27 - - xscvspdp vs9, vs3 - xxspltw vs10, vs3, 1 - xxspltw vs11, vs3, 2 - xxspltw vs12, vs3, 3 - xscvspdp vs10,vs10 - xscvspdp vs11,vs11 - xscvspdp vs12,vs12 - - xscvspdp vs28, vs1 - xxspltw vs29, vs1, 1 - xxspltw vs30, vs1, 2 - xxspltw vs31, vs1, 3 - xscvspdp vs29,vs29 - xscvspdp vs30,vs30 - xscvspdp vs31,vs31 - - - - -#if defined(TRMMKERNEL) - 
xsmuldp vs32,vs8, vs4 - xsmuldp vs33,vs27, vs4 - - xsmuldp vs34,vs7, vs4 - xsmuldp vs35,vs26, vs4 - - xsmuldp vs36,vs6, vs4 - xsmuldp vs37,vs25, vs4 - - xsmuldp vs38,vs5, vs4 - xsmuldp vs39,vs24, vs4 - - xsmuldp vs40,vs12, vs4 - xsmuldp vs41,vs31, vs4 - - xsmuldp vs42,vs11, vs4 - xsmuldp vs43,vs30, vs4 - - xsmuldp vs44,vs10, vs4 - xsmuldp vs45,vs29, vs4 - - xsmuldp vs46,vs9, vs4 - xsmuldp vs47,vs28, vs4 -#else - xsmaddadp vs32,vs8, vs4 - xsmaddadp vs33,vs27, vs4 - - xsmaddadp vs34,vs7, vs4 - xsmaddadp vs35,vs26, vs4 - - xsmaddadp vs36,vs6, vs4 - xsmaddadp vs37,vs25, vs4 - - xsmaddadp vs38,vs5, vs4 - xsmaddadp vs39,vs24, vs4 - - xsmaddadp vs40,vs12, vs4 - xsmaddadp vs41,vs31, vs4 - - xsmaddadp vs42,vs11, vs4 - xsmaddadp vs43,vs30, vs4 - - xsmaddadp vs44,vs10, vs4 - xsmaddadp vs45,vs29, vs4 - - xsmaddadp vs46,vs9, vs4 - xsmaddadp vs47,vs28, vs4 -#endif - - stxssp v0,0(CO) - stxssp v1,4(CO) - - stxssp v2,0(T1) - stxssp v3,4(T1) - - stxssp v4,0(T2) - stxssp v5,4(T2) - - stxssp v6,0(T3) - stxssp v7,4(T3) - - stxssp v8,0(T4) - stxssp v9,4(T4) - - stxssp v10,0(T5) - stxssp v11,4(T5) - - stxssp v12,0(T6) - stxssp v13,4(T6) - - stxssp v14,0(T7) - stxssp v15,4(T7) - - - addi CO,CO,8 -.endm - - -/********************************************************************************************** -* Macros for N=8 and M=1 -**********************************************************************************************/ -.macro KERNEL8x1_4 OffsetA,OffsetB, Index,IsLast - KERNEL8x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro Zero8x1 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 -.endm - -.macro KERNEL8x1 - KERNEL8x1_1 AO,BO, 0 -.endm - -.macro KERNEL8x1_2 - KERNEL8x1_2_1 AO,BO, 0 -.endm - -.macro KERNEL8x1_1 AREG,BREG,First - lxvwsx vs8, 0, \AREG - lxv vs26, 0(\BREG) - lxv vs27, 16(\BREG) -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - .endif - addi \AREG, \AREG, 4 - addi \BREG, \BREG, 32 -.endm - -.macro KERNEL8x1_2_1 AREG,BREG,First - lxsd v4, 0(\AREG) - lxv vs26, 0(\BREG) - lxv vs27, 16(\BREG) - lxv vs28, 32(\BREG) - lxv vs29, 48(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs0, vs28, vs9 - xvmulsp vs1, vs29, vs9 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs0, vs28, vs9 - xvmaddasp vs1, vs29, vs9 - .endif - addi \AREG, \AREG, 8 - addi \BREG, \BREG, 64 -.endm - -.macro KERNEL8x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast - lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) - xxspltw vs8, vs4, 3 - xxspltw vs9, vs4, 2 - xxspltw vs10, vs4, 1 - xxspltw vs11, vs4, 0 - lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) - lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) - lxv vs28, DISP32(\Index,32+\OffsetB)(\BREG) - lxv vs29, DISP32(\Index,48+\OffsetB)(\BREG) - lxv vs30, DISP32(\Index,64+ 0+\OffsetB)(\BREG) - lxv vs31, DISP32(\Index,64+16+\OffsetB)(\BREG) - lxv vs32, DISP32(\Index,64+32+\OffsetB)(\BREG) - lxv vs33, DISP32(\Index,64+48+\OffsetB)(\BREG) -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs0, vs28, vs9 - xvmulsp vs1, vs29, vs9 - xvmulsp vs0, vs30, vs10 - xvmulsp vs1, vs31, vs10 - xvmulsp vs0, vs32, vs11 - xvmulsp vs1, vs33, vs11 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs0, vs28, vs9 - xvmaddasp vs1, vs29, vs9 - xvmaddasp vs0, vs30, vs10 - xvmaddasp vs1, vs31, vs10 - xvmaddasp vs0, vs32, vs11 - xvmaddasp vs1, vs33, vs11 - .endif -.if \IsLast==1 - addi 
\AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP32(\Index,128) -.endif -.endm - -.macro SAVE8x1 - slwi T10, LDC , 1 - add T1, CO, LDC - add T2, CO, T10 - add T3, T1, T10 - add T4, T2, T10 - add T5, T3, T10 - add T6, T4, T10 - add T7, T5, T10 - /*convert alpha_r for multiply*/ - xscvspdp vs4,alpha_r -/* v0 corresponds to vs32, do not forget*/ -#if !defined(TRMMKERNEL) - lxssp v0,0(CO) - lxssp v2,0(T1) - lxssp v4,0(T2) - lxssp v6,0(T3) - lxssp v8,0(T4) - lxssp v10,0(T5) - lxssp v12,0(T6) - lxssp v14,0(T7) -#endif - xscvspdp vs24, vs0 - xxspltw vs25, vs0, 1 - xxspltw vs26, vs0, 2 - xxspltw vs27, vs0, 3 - xscvspdp vs25,vs25 - xscvspdp vs26,vs26 - xscvspdp vs27,vs27 - xscvspdp vs28, vs1 - xxspltw vs29, vs1, 1 - xxspltw vs30, vs1, 2 - xxspltw vs31, vs1, 3 - xscvspdp vs29,vs29 - xscvspdp vs30,vs30 - xscvspdp vs31,vs31 -#if defined(TRMMKERNEL) - xsmuldp vs32,vs27, vs4 - xsmuldp vs34,vs26, vs4 - xsmuldp vs36,vs25, vs4 - xsmuldp vs38,vs24, vs4 - xsmuldp vs40,vs31, vs4 - xsmuldp vs42,vs30, vs4 - xsmuldp vs44,vs29, vs4 - xsmuldp vs46,vs28, vs4 -#else - xsmaddadp vs32,vs27, vs4 - xsmaddadp vs34,vs26, vs4 - xsmaddadp vs36,vs25, vs4 - xsmaddadp vs38,vs24, vs4 - xsmaddadp vs40,vs31, vs4 - xsmaddadp vs42,vs30, vs4 - xsmaddadp vs44,vs29, vs4 - xsmaddadp vs46,vs28, vs4 -#endif - stxssp v0,0(CO) - stxssp v2,0(T1) - stxssp v4,0(T2) - stxssp v6,0(T3) - stxssp v8,0(T4) - stxssp v10,0(T5) - stxssp v12,0(T6) - stxssp v14,0(T7) - addi CO,CO,4 -.endm - - - -/********************************************************************************************** -* Macros for N=4 and M=16 -**********************************************************************************************/ - -.macro LOAD4x16_1 - LOAD4x16 1 -.endm - -.macro LOAD4x16_0 - LOAD4x16 0 -.endm - -.macro KERNEL4x16_L1_L4 Index,IsLast - KERNEL4x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro Zero4X16 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 -.endm - -.macro LOAD4x16 Zero - - lxv vs24, 0(BO) - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor 
vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - -.endif -.endm - -.macro END4x16_NORMAL - END4x16 0, AO, BO, 64,16 -.endm - -.macro END4x16 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - xvmulsp vs34, vs2,vs24 - xvmulsp vs35, vs3,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - xvmulsp vs38, vs2,vs25 - xvmulsp vs39, vs3,vs25 - - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - xvmulsp vs42, vs2,vs26 - xvmulsp vs43, vs3,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - xvmulsp vs46, vs2,vs27 - xvmulsp vs47, vs3,vs27 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - -.endif -.endm - -.macro KERNEL4x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) - - lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) - lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - - xxpermdi vs11, vs10, vs10,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - - - lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) - - lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) - lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - - xxpermdi vs27, vs26, vs26,2 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - - lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) - - lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) - lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) - lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) - lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - - xxpermdi vs11, vs10, vs10,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, 
vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - - -.if \Complete==0 - lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) - - lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) - lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) - lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) - lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - -.endif -.if \IsLast==1 -.if \Complete==1 - - addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) - addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) -.else - - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP64(\Index,256) -.endif -.endif - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - -.endif - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - - -.endm - -.macro KERNEL4x16 First - - LOAD4x16 0 - END4x16 \First, AO, BO, 64,16 -.endm - -.macro KERNEL4x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - xvmulsp vs34, vs2,vs24 - xvmulsp vs35, vs3,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - xvmulsp vs38, vs2,vs25 - xvmulsp vs39, vs3,vs25 -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 -.endif - - xxpermdi vs11, vs10, vs10,2 - -.if \First==1 - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - xvmulsp vs42, vs2,vs26 - xvmulsp vs43, vs3,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - xvmulsp vs46, vs2,vs27 - xvmulsp vs47, vs3,vs27 - - -.else - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - - -.endif -.if \Complete==0 - lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) - lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) - lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) - addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) - -.else - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP32(\Index,128) -.endif -.endif - -.if \First==1 - xvmulsp vs32, vs4,vs8 - xvmulsp vs33, vs5,vs8 - xvmulsp vs34, vs6,vs8 - xvmulsp vs35, vs7,vs8 - - xvmulsp vs36, vs4,vs9 - xvmulsp vs37, vs5,vs9 - xvmulsp vs38, vs6,vs9 - xvmulsp vs39, vs7,vs9 -.else - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - - xvmaddasp vs36, 
vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - -.endif -.if \First==1 - xvmulsp vs40, vs4,vs10 - xvmulsp vs41, vs5,vs10 - xvmulsp vs42, vs6,vs10 - xvmulsp vs43, vs7,vs10 - - xvmulsp vs44, vs4,vs11 - xvmulsp vs45, vs5,vs11 - xvmulsp vs46, vs6,vs11 - xvmulsp vs47, vs7,vs11 - - - -.else - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - - - -.endif - -.endm - - -.macro SAVE4x16 - - slwi T10, LDC , 1 - add T1, CO, LDC - - add T2, CO, T10 - add T3, T1, T10 - - - - xxmrglw vs8, vs32, vs44 - xxmrglw vs10, vs36, vs40 - - xxmrghw vs1, vs32, vs44 - xxmrghw vs0, vs36, vs40 - - xxmrglw vs12, vs33, vs45 - xxmrglw vs14, vs37, vs41 - - xxmrghw vs2, vs37, vs41 - xxmrghw vs3, vs33, vs45 - - xxmrglw vs16, vs34, vs46 - xxmrglw vs18, vs38, vs42 - - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - - xxmrghw vs4, vs38, vs42 - xxmrghw vs5, vs34, vs46 - - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxmrglw vs24, vs35, vs47 - xxmrglw vs26, vs39, vs43 - - xxlor vs17, vs16, vs16 - xxlor vs19, vs18, vs18 - - xxmrghw vs30, vs39, vs43 - xxmrghw vs31, vs35, vs47 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - -#ifndef TRMMKERNEL - lxv vs32, 0(CO) - lxv vs33, 16(CO) - lxv vs34, 32(CO) - lxv vs35, 48(CO) -#endif - xxlor vs25, vs24, vs24 - xxlor vs27, vs26, vs26 - -#ifndef TRMMKERNEL - lxv vs36, 0(T1) - lxv vs37, 16(T1) - lxv vs38, 32(T1) - lxv vs39, 48(T1) -#endif -#ifndef TRMMKERNEL - lxv vs40, 0(T2) - lxv vs41, 16(T2) - lxv vs42, 32(T2) - lxv vs43, 48(T2) -#endif -#ifndef TRMMKERNEL - lxv vs44, 0(T3) - lxv vs45, 16(T3) - lxv vs46, 32(T3) - lxv vs47, 48(T3) -#endif - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - - xxperm vs16, vs4, save_permute_1 - xxperm vs18, vs5, save_permute_1 - - xxperm vs17, vs4, save_permute_2 - xxperm vs19, vs5, save_permute_2 - - xxperm vs24, vs30, save_permute_1 - xxperm vs26, vs31, save_permute_1 - - xxperm vs25, vs30, save_permute_2 - xxperm vs27, vs31, save_permute_2 - - - /* multiply add normal way */ - -#ifdef TRMMKERNEL - xvmulsp vs32, vs8, alpha_r - xvmulsp vs33, vs12, alpha_r - xvmulsp vs34, vs16, alpha_r - xvmulsp vs35, vs24, alpha_r - xvmulsp vs36, vs9, alpha_r - xvmulsp vs37, vs13, alpha_r - xvmulsp vs38, vs17, alpha_r - xvmulsp vs39, vs25, alpha_r -#else - xvmaddasp vs32, vs8, alpha_r - xvmaddasp vs33, vs12, alpha_r - xvmaddasp vs34, vs16, alpha_r - xvmaddasp vs35, vs24, alpha_r - xvmaddasp vs36, vs9, alpha_r - xvmaddasp vs37, vs13, alpha_r - xvmaddasp vs38, vs17, alpha_r - xvmaddasp vs39, vs25, alpha_r -#endif - - - -#ifdef TRMMKERNEL - xvmulsp vs40, vs10, alpha_r - xvmulsp vs41, vs14, alpha_r - xvmulsp vs42, vs18, alpha_r - xvmulsp vs43, vs26, alpha_r - xvmulsp vs44, vs11, alpha_r - xvmulsp vs45, vs15, alpha_r - xvmulsp vs46, vs19, alpha_r - xvmulsp vs47, vs27, alpha_r -#else - - xvmaddasp vs40, vs10, alpha_r - xvmaddasp vs41, vs14, alpha_r - xvmaddasp vs42, vs18, alpha_r - xvmaddasp vs43, vs26, alpha_r - xvmaddasp vs44, vs11, alpha_r - xvmaddasp vs45, vs15, alpha_r - xvmaddasp vs46, vs19, alpha_r - xvmaddasp vs47, vs27, alpha_r - -#endif - - stxv vs32, 0(CO) - stxv vs33, 16(CO) - stxv vs34, 32(CO) - stxv vs35, 48(CO) - - 
stxv vs36, 0(T1) - stxv vs37, 16(T1) - stxv vs38, 32(T1) - stxv vs39, 48(T1) - - stxv vs40, 0(T2) - stxv vs41, 16(T2) - stxv vs42, 32(T2) - stxv vs43, 48(T2) - stxv vs44, 0(T3) - stxv vs45, 16(T3) - stxv vs46, 32(T3) - stxv vs47, 48(T3) - - addi CO,CO,64 - - -.endm - - - -/********************************************************************************************** -* Macros for N=4 and M=8 -**********************************************************************************************/ - -.macro LOAD4x8_1 - LOAD4x8 1 -.endm - -.macro LOAD4x8_0 - LOAD4x8 0 -.endm - -.macro KERNEL4x8_L1_L4 Index,IsLast - KERNEL4x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro END4x8_NORMAL - END4x8 0, AO, BO, 32,16 -.endm - -.macro Zero4X8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - -.endm - -.macro LOAD4x8 Zero - - lxv vs24, 0(BO) - lxv vs0, 0(AO) - lxv vs1, 16(AO) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - - xxpermdi vs27, vs26, vs26,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - -.endif -.endm - - -.macro END4x8 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - -.endif -.endm - -.macro KERNEL4x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) - - lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xxpermdi vs11, vs10, vs10,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - - - lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) - - lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, 
vs24, vs24,2 - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - - xxpermdi vs27, vs26, vs26,2 - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - - - lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) - - lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - - xxpermdi vs11, vs10, vs10,2 - - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - - -.if \Complete==0 - lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) - - lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - -.endif -.if \IsLast==1 -.if \Complete==1 - - addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) - addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) -.else - - addi \BREG, \BREG, DISP16(\Index,64) - addi \AREG, \AREG, DISP32(\Index,128) -.endif -.endif - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - -.endif - - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - - - -.endm - -.macro KERNEL4x8 First - - LOAD4x8 0 - END4x8 \First, AO, BO, 32,16 -.endm - -.macro KERNEL4x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) - - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.if \First==1 - xvmulsp vs32, vs0,vs24 - xvmulsp vs33, vs1,vs24 - - xvmulsp vs36, vs0,vs25 - xvmulsp vs37, vs1,vs25 - -.else - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - -.endif - - xxpermdi vs11, vs10, vs10,2 - -.if \First==1 - xvmulsp vs40, vs0,vs26 - xvmulsp vs41, vs1,vs26 - - xvmulsp vs44, vs0,vs27 - xvmulsp vs45, vs1,vs27 - - -.else - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - - -.endif -.if \Complete==0 - lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) - - lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) - - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) - addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) - -.else - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP16(\Index,64) -.endif -.endif - -.if \First==1 - xvmulsp vs32, vs4,vs8 - xvmulsp vs33, vs5,vs8 - - xvmulsp vs36, vs4,vs9 - xvmulsp vs37, vs5,vs9 - -.else - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - -.endif -.if \First==1 - xvmulsp vs40, vs4,vs10 - xvmulsp vs41, vs5,vs10 - - xvmulsp vs44, vs4,vs11 - xvmulsp vs45, vs5,vs11 - -.else - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - -.endif - -.endm - - -.macro SAVE4x8 - - slwi T10, LDC , 1 - add T1, CO, LDC - - add T2, CO, T10 - add T3, T1, T10 - - - -#ifndef TRMMKERNEL - lxv vs34, 0(CO) 
- lxv vs35, 16(CO) - lxv vs38, 0(T1) - lxv vs39, 16(T1) - lxv vs42, 0(T2) - lxv vs43, 16(T2) - lxv vs46, 0(T3) - lxv vs47, 16(T3) - - -#endif - - xxmrglw vs8, vs32, vs44 - xxmrglw vs10, vs36, vs40 - - xxmrghw vs1, vs32, vs44 - xxmrghw vs0, vs36, vs40 - - xxmrglw vs12, vs33, vs45 - xxmrglw vs14, vs37, vs41 - - xxmrghw vs2, vs37, vs41 - xxmrghw vs3, vs33, vs45 - - xxlor vs9, vs8, vs8 - xxlor vs11, vs10, vs10 - - xxlor vs13, vs12, vs12 - xxlor vs15, vs14, vs14 - - xxperm vs8, vs0, save_permute_1 - xxperm vs10, vs1, save_permute_1 - xxperm vs9, vs0, save_permute_2 - xxperm vs11, vs1, save_permute_2 - - xxperm vs12, vs2, save_permute_1 - xxperm vs14, vs3, save_permute_1 - - xxperm vs13, vs2, save_permute_2 - xxperm vs15, vs3, save_permute_2 - - - /* multiply add normal way */ - -#ifdef TRMMKERNEL - xvmulsp vs34, vs8, alpha_r - xvmulsp vs35, vs12, alpha_r - xvmulsp vs38, vs9, alpha_r - xvmulsp vs39, vs13, alpha_r - xvmulsp vs42, vs10, alpha_r - xvmulsp vs43, vs14, alpha_r - xvmulsp vs46, vs11, alpha_r - xvmulsp vs47, vs15, alpha_r -#else - xvmaddasp vs34, vs8, alpha_r - xvmaddasp vs35, vs12, alpha_r - xvmaddasp vs38, vs9, alpha_r - xvmaddasp vs39, vs13, alpha_r - xvmaddasp vs42, vs10, alpha_r - xvmaddasp vs43, vs14, alpha_r - xvmaddasp vs46, vs11, alpha_r - xvmaddasp vs47, vs15, alpha_r -#endif - - - stxv vs34, 0(CO) - stxv vs35, 16(CO) - stxv vs38, 0(T1) - stxv vs39, 16(T1) - stxv vs42, 0(T2) - stxv vs43, 16(T2) - stxv vs46, 0(T3) - stxv vs47, 16(T3) - - - addi CO,CO,32 - -.endm - - -/********************************************************************************************** -* Macros for N=4 and M=4 -**********************************************************************************************/ - -.macro LOAD4x4_1 - LOAD4x4 1 -.endm - -.macro LOAD4x4_0 - LOAD4x4 0 -.endm - -.macro KERNEL4x4_L1_L4 Index,IsLast - KERNEL4x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 -.endm - -.macro KERNEL4x4_I1_L4 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm -.macro KERNEL4x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro Zero4X4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - -.endm - -.macro LOAD4x4 Zero - - lxv vs0, 0(AO) - lxv vs24, 0(BO) - - - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 - -.if \Zero==1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - -.endif -.endm - -.macro END4x4_NORMAL - END4x4 0, AO, BO, 16,16 -.endm - -.macro END4x4 First, AREG, BREG, OffsetA, OffsetB - -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - -.if \First==1 - xvmulsp vs32, vs24, vs0 - xvmulsp vs33, vs24, vs1 - xvmulsp vs34, vs24, vs2 - xvmulsp vs35, vs24, vs3 -.else - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - 
xvmaddasp vs35, vs24, vs3 - - -.endif -.endm - -.macro KERNEL4x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 - - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - - lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) - lxv vs24, DISP16(\Index, 16+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 - - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - - - lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) - lxv vs26, DISP16(\Index, 32+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 - - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - - -.if \Complete==0 - - lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) - lxv vs24, DISP16(\Index, 48+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 -.endif - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) - addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) - -.else - addi \AREG, \AREG, DISP16(\Index,64) - addi \BREG, \BREG, DISP16(\Index,64) - -.endif -.endif - - -.endm - -.macro KERNEL4x4 First - LOAD4x4 0 - END4x4 \First, AO, BO, 16,16 -.endm - -.macro KERNEL4x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) - - xxperm vs6, vs4, permute_mask - xxpermdi vs5, vs4, vs4,2 - xxpermdi vs7, vs6, vs6,2 -.if \First==1 - xvmulsp vs32, vs24, vs0 - xvmulsp vs33, vs24, vs1 - xvmulsp vs34, vs24, vs2 - xvmulsp vs35, vs24, vs3 - -.else - xvmaddasp vs32, vs24, vs0 - xvmaddasp vs33, vs24, vs1 - xvmaddasp vs34, vs24, vs2 - xvmaddasp vs35, vs24, vs3 - -.endif - -.if \Complete==0 - - lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) - lxv vs24, DISP8(\Index, 16+\OffsetB)(\BREG) - - xxperm vs2, vs0, permute_mask - xxpermdi vs1, vs0, vs0,2 - xxpermdi vs3, vs2, vs2,2 -.endif - -.if \First==1 - xvmulsp vs32, vs26, vs4 - xvmulsp vs33, vs26, vs5 - xvmulsp vs34, vs26, vs6 - xvmulsp vs35, vs26, vs7 - - -.else - xvmaddasp vs32, vs26, vs4 - xvmaddasp vs33, vs26, vs5 - xvmaddasp vs34, vs26, vs6 - xvmaddasp vs35, vs26, vs7 - -.endif - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) - -.else - addi \AREG, \AREG, DISP8(\Index,32) - addi \BREG, \BREG, DISP8(\Index,32) - -.endif -.endif - - -.endm - - -.macro SAVE4x4 - slwi T10, LDC , 1 - add T1, CO, LDC -#if !defined(TRMMKERNEL) - lxv vs36, 0(CO) - lxv vs37, 0(T1) -#endif - add T2, CO, T10 - add T3, T1, T10 -#if !defined(TRMMKERNEL) - lxv vs38, 0(T2) - lxv vs39, 0(T3) -#endif - - xxmrglw vs0, vs35,vs32 - xxmrglw vs1, vs34,vs33 - xxmrglw vs4, vs32,vs35 - xxmrglw vs5, vs33,vs34 - - - xxmrghw vs2, vs35,vs32 - xxmrghw vs3, vs34,vs33 - xxmrghw vs6, vs32,vs35 - xxmrghw vs7, vs33,vs34 - - xxmrgld vs24, vs1, vs0 - xxmrghd vs25,vs5,vs4 - - xxmrgld vs26, vs2, vs3 - xxmrghd vs27,vs6,vs7 - - #if defined(TRMMKERNEL) - xvmulsp vs36, vs24, alpha_r - xvmulsp vs37, vs25, alpha_r - xvmulsp vs38, vs26, 
alpha_r - xvmulsp vs39, vs27, alpha_r -#else - xvmaddasp vs36, vs24, alpha_r - xvmaddasp vs37, vs25, alpha_r - xvmaddasp vs38, vs26, alpha_r - xvmaddasp vs39, vs27, alpha_r - #endif - stxv vs36, 0(CO) - stxv vs37, 0(T1) - stxv vs38, 0(T2) - stxv vs39, 0(T3) - - - - addi CO,CO,16 -.endm - - -/********************************************************************************************** -* Macros for N=4 and M=2 -**********************************************************************************************/ - - -.macro KERNEL4x2_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - - -.macro Zero4x2 - xxlxor vs0, vs0, vs0 - xxlxor vs2, vs2, vs2 - -.endm - -.macro KERNEL4x2 - KERNEL4x2_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL4x2_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 0 - xxspltw vs9, vs36, 1 - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs2, vs26, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs2, vs26, vs9 - - .endif - - addi \AREG, \AREG, DISP2(\Index,8) - addi \BREG, \BREG, DISP4(\Index,16) - -.endm - -.macro KERNEL4x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast - - lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs28, DISP8(\Index,16+\OffsetB)(\BREG) - xxspltw vs8, vs4, 2 - xxspltw vs9, vs4, 3 - xxspltw vs10, vs4, 0 - xxspltw vs11, vs4, 1 - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs2, vs26, vs9 - - xvmulsp vs0, vs28, vs10 - xvmulsp vs2, vs28, vs11 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs2, vs26, vs9 - - xvmaddasp vs0, vs28, vs10 - xvmaddasp vs2, vs28, vs11 - .endif - - -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP8(\Index,32) -.endif - -.endm - - -.macro SAVE4x2 - slwi T10, LDC , 1 - add T1, CO, LDC - add T2, CO, T10 - add T3, T1, T10 - /*convert alpha_r for multiply*/ - xscvspdp vs4,alpha_r -/* v0 corresponds to vs32, do not forget*/ -#if !defined(TRMMKERNEL) - lxssp v0,0(CO) - lxssp v1,4(CO) - - lxssp v2,0(T1) - lxssp v3,4(T1) - - lxssp v4,0(T2) - lxssp v5,4(T2) - - lxssp v6,0(T3) - lxssp v7,4(T3) - - -#endif - xscvspdp vs5, vs2 - xxspltw vs6, vs2, 1 - xxspltw vs7, vs2, 2 - xxspltw vs8, vs2, 3 - xscvspdp vs6,vs6 - xscvspdp vs7,vs7 - xscvspdp vs8,vs8 - - xscvspdp vs24, vs0 - xxspltw vs25, vs0, 1 - xxspltw vs26, vs0, 2 - xxspltw vs27, vs0, 3 - xscvspdp vs25,vs25 - xscvspdp vs26,vs26 - xscvspdp vs27,vs27 - - -#if defined(TRMMKERNEL) - xsmuldp vs32,vs8, vs4 - xsmuldp vs33,vs27, vs4 - - xsmuldp vs34,vs7, vs4 - xsmuldp vs35,vs26, vs4 - - xsmuldp vs36,vs6, vs4 - xsmuldp vs37,vs25, vs4 - - xsmuldp vs38,vs5, vs4 - xsmuldp vs39,vs24, vs4 - - -#else - xsmaddadp vs32,vs8, vs4 - xsmaddadp vs33,vs27, vs4 - - xsmaddadp vs34,vs7, vs4 - xsmaddadp vs35,vs26, vs4 - - xsmaddadp vs36,vs6, vs4 - xsmaddadp vs37,vs25, vs4 - - xsmaddadp vs38,vs5, vs4 - xsmaddadp vs39,vs24, vs4 - - -#endif - - stxssp v0,0(CO) - stxssp v1,4(CO) - - stxssp v2,0(T1) - stxssp v3,4(T1) - - stxssp v4,0(T2) - stxssp v5,4(T2) - - stxssp v6,0(T3) - stxssp v7,4(T3) - - - - - addi CO,CO,8 -.endm - - -/********************************************************************************************** -* Macros for N=4 and M=1 -**********************************************************************************************/ -.macro KERNEL4x1_4 OffsetA,OffsetB, Index,IsLast - KERNEL4x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro Zero4x1 - xxlxor vs0, vs0, 
vs0 -.endm - -.macro KERNEL4x1 - KERNEL4x1_1 AO,BO, 0 -.endm - -.macro KERNEL4x1_2 - KERNEL4x1_2_1 AO,BO, 0 -.endm - -.macro KERNEL4x1_1 AREG,BREG,First - lxvwsx vs8, 0, \AREG - lxv vs26, 0(\BREG) -.if \First==1 - xvmulsp vs0, vs26, vs8 -.else - xvmaddasp vs0, vs26, vs8 - .endif - addi \AREG, \AREG, 4 - addi \BREG, \BREG, 16 -.endm - -.macro KERNEL4x1_2_1 AREG,BREG,First - lxsd v4, 0(\AREG) - lxv vs26, 0(\BREG) - lxv vs28, 16(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs0, vs28, vs9 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs0, vs28, vs9 - .endif - addi \AREG, \AREG, 8 - addi \BREG, \BREG, 32 -.endm - -.macro KERNEL4x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast - lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) - xxspltw vs8, vs4, 3 - xxspltw vs9, vs4, 2 - xxspltw vs10, vs4, 1 - xxspltw vs11, vs4, 0 - lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) - lxv vs28, DISP16(\Index,16+\OffsetB)(\BREG) - lxv vs30, DISP16(\Index,32+\OffsetB)(\BREG) - lxv vs32, DISP16(\Index,48+\OffsetB)(\BREG) -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs0, vs28, vs9 - xvmulsp vs0, vs30, vs10 - xvmulsp vs0, vs32, vs11 -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs0, vs28, vs9 - xvmaddasp vs0, vs30, vs10 - xvmaddasp vs0, vs32, vs11 - .endif -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP16(\Index,64) -.endif -.endm - -.macro SAVE4x1 - slwi T10, LDC , 1 - add T1, CO, LDC - add T2, CO, T10 - add T3, T1, T10 - /*convert alpha_r for multiply*/ - xscvspdp vs4,alpha_r -/* v0 corresponds to vs32, do not forget*/ -#if !defined(TRMMKERNEL) - lxssp v0,0(CO) - lxssp v2,0(T1) - lxssp v4,0(T2) - lxssp v6,0(T3) -#endif - xscvspdp vs24, vs0 - xxspltw vs25, vs0, 1 - xxspltw vs26, vs0, 2 - xxspltw vs27, vs0, 3 - xscvspdp vs25,vs25 - xscvspdp vs26,vs26 - xscvspdp vs27,vs27 - -#if defined(TRMMKERNEL) - xsmuldp vs32,vs27, vs4 - xsmuldp vs34,vs26, vs4 - xsmuldp vs36,vs25, vs4 - xsmuldp vs38,vs24, vs4 -#else - xsmaddadp vs32,vs27, vs4 - xsmaddadp vs34,vs26, vs4 - xsmaddadp vs36,vs25, vs4 - xsmaddadp vs38,vs24, vs4 -#endif - stxssp v0,0(CO) - stxssp v2,0(T1) - stxssp v4,0(T2) - stxssp v6,0(T3) - addi CO,CO,4 -.endm - -/****************************N=2 section*****************/ - -.macro KERNEL2x16_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero2x16 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 - xxlxor vs4, vs4, vs4 - xxlxor vs5, vs5, vs5 - xxlxor vs6, vs6, vs6 - xxlxor vs7, vs7, vs7 -.endm - -.macro KERNEL2x16 - KERNEL2x16_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x16_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x16_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) - lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs2, vs28, vs8 - xvmulsp vs3, vs29, vs8 - - xvmulsp vs4, vs26, vs9 - xvmulsp vs5, vs27, vs9 - xvmulsp vs6, vs28, vs9 - xvmulsp vs7, vs29, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - xvmaddasp vs6, vs28, vs9 - xvmaddasp vs7, vs29, 
vs9 - - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP16(\Index,64) - -.endm - - - - -.macro KERNEL2x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) - lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) - - lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) - lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) - lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) - - lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) - lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) - lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) - lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) - - lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) - lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) - lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) - lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - xxspltw vs12, vs39, 3 - xxspltw vs13, vs39, 2 - xxspltw vs14, vs39, 1 - xxspltw vs15, vs39, 0 - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - xvmaddasp vs6, vs28, vs9 - xvmaddasp vs7, vs29, vs9 - - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs17, vs10 - xvmaddasp vs2, vs18, vs10 - xvmaddasp vs3, vs19, vs10 - - xvmaddasp vs4, vs16, vs11 - xvmaddasp vs5, vs17, vs11 - xvmaddasp vs6, vs18, vs11 - xvmaddasp vs7, vs19, vs11 - - xvmaddasp vs0, vs30, vs12 - xvmaddasp vs1, vs31, vs12 - xvmaddasp vs2, vs32, vs12 - xvmaddasp vs3, vs33, vs12 - - xvmaddasp vs4, vs30, vs13 - xvmaddasp vs5, vs31, vs13 - xvmaddasp vs6, vs32, vs13 - xvmaddasp vs7, vs33, vs13 - - xvmaddasp vs0, vs34, vs14 - xvmaddasp vs1, vs35, vs14 - xvmaddasp vs2, vs36, vs14 - xvmaddasp vs3, vs37, vs14 - - xvmaddasp vs4, vs34, vs15 - xvmaddasp vs5, vs35, vs15 - xvmaddasp vs6, vs36, vs15 - xvmaddasp vs7, vs37, vs15 - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP64(\Index,256) -.endif - -.endm - -.macro KERNEL2x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 3 - xxspltw vs9, vs36, 2 - xxspltw vs10, vs36, 1 - xxspltw vs11, vs36, 0 - lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) - lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) - lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) - lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - xvmaddasp vs6, vs28, vs9 - xvmaddasp vs7, vs29, vs9 - - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs17, vs10 - xvmaddasp vs2, vs18, vs10 - xvmaddasp vs3, vs19, vs10 - - xvmaddasp vs4, vs16, vs11 - xvmaddasp vs5, vs17, vs11 - xvmaddasp vs6, vs18, vs11 - xvmaddasp vs7, vs19, vs11 - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP32(\Index,128) -.endif - -.endm - - -.macro SAVE2x16 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) - lxv vs17, 16(CO) - lxv vs18, 32(CO) - lxv 
vs19, 48(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxv vs26, 0(T1) - lxv vs27, 16(T1) - lxv vs28, 32(T1) - lxv vs29, 48(T1) -#endif - -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs17, vs1, alpha_r - xvmulsp vs18, vs2, alpha_r - xvmulsp vs19, vs3, alpha_r - xvmulsp vs26, vs4, alpha_r - xvmulsp vs27, vs5, alpha_r - xvmulsp vs28, vs6, alpha_r - xvmulsp vs29, vs7, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs17, vs1, alpha_r - xvmaddasp vs18, vs2, alpha_r - xvmaddasp vs19, vs3, alpha_r - xvmaddasp vs26, vs4, alpha_r - xvmaddasp vs27, vs5, alpha_r - xvmaddasp vs28, vs6, alpha_r - xvmaddasp vs29, vs7, alpha_r -#endif - stxv vs16, 0(CO) - stxv vs17, 16(CO) - stxv vs18, 32(CO) - stxv vs19, 48(CO) - - stxv vs26, 0(T1) - stxv vs27, 16(T1) - stxv vs28, 32(T1) - stxv vs29, 48(T1) - - addi CO,CO,64 - -.endm - -/* M=8 N=2 */ - -.macro KERNEL2x8_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero2x8 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - - xxlxor vs4, vs4, vs4 - xxlxor vs5, vs5, vs5 - -.endm - -.macro KERNEL2x8 - KERNEL2x8_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x8_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x8_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - - xvmulsp vs4, vs26, vs9 - xvmulsp vs5, vs27, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP8(\Index,32) - -.endm - - - - -.macro KERNEL2x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) - - lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) - - lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) - - lxv vs34, DISP32(\Index, 96+ 0+\OffsetA)(\AREG) - lxv vs35, DISP32(\Index, 96+ 16+\OffsetA)(\AREG) - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - xxspltw vs12, vs39, 3 - xxspltw vs13, vs39, 2 - xxspltw vs14, vs39, 1 - xxspltw vs15, vs39, 0 - - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - - - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs17, vs10 - xvmaddasp vs4, vs16, vs11 - xvmaddasp vs5, vs17, vs11 - - - xvmaddasp vs0, vs30, vs12 - xvmaddasp vs1, vs31, vs12 - xvmaddasp vs4, vs30, vs13 - xvmaddasp vs5, vs31, vs13 - - xvmaddasp vs0, vs34, vs14 - xvmaddasp vs1, vs35, vs14 - xvmaddasp vs4, vs34, vs15 - xvmaddasp vs5, vs35, vs15 - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP32(\Index,128) -.endif - -.endm - -.macro KERNEL2x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 3 - xxspltw vs9, vs36, 2 - xxspltw vs10, vs36, 1 - xxspltw vs11, vs36, 0 - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - lxv vs16, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs17, 
DISP16(\Index,48+\OffsetA)(\AREG) - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - xvmaddasp vs4, vs26, vs9 - xvmaddasp vs5, vs27, vs9 - - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs17, vs10 - - xvmaddasp vs4, vs16, vs11 - xvmaddasp vs5, vs17, vs11 - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP16(\Index,64) -.endif - -.endm - - -.macro SAVE2x8 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) - lxv vs17, 16(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxv vs26, 0(T1) - lxv vs27, 16(T1) - -#endif - -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs17, vs1, alpha_r - xvmulsp vs26, vs4, alpha_r - xvmulsp vs27, vs5, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs17, vs1, alpha_r - xvmaddasp vs26, vs4, alpha_r - xvmaddasp vs27, vs5, alpha_r -#endif - - stxv vs16, 0(CO) - stxv vs17, 16(CO) - - - stxv vs26, 0(T1) - stxv vs27, 16(T1) - - addi CO,CO,32 - -.endm - - -/*M=4*/ - - -.macro KERNEL2x4_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - /* we will aggregate on save vs0 +vs4 vs11+vs5 */ -.macro Zero2x4 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - - xxlxor vs4, vs4, vs4 - xxlxor vs5, vs5, vs5 - -.endm - -.macro KERNEL2x4 - KERNEL2x4_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x4_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x4_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs26, vs9 - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP4(\Index,16) - -.endm - - - - -.macro KERNEL2x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs16, DISP16(\Index,16+\OffsetA)(\AREG) - - lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs34, DISP16(\Index,32+ 16+\OffsetA)(\AREG) - - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - xxspltw vs12, vs39, 3 - xxspltw vs13, vs39, 2 - xxspltw vs14, vs39, 1 - xxspltw vs15, vs39, 0 - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - xvmaddasp vs4, vs16, vs10 - xvmaddasp vs5, vs16, vs11 - - - xvmaddasp vs0, vs30, vs12 - xvmaddasp vs1, vs30, vs13 - xvmaddasp vs4, vs34, vs14 - xvmaddasp vs5, vs34, vs15 - - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP16(\Index,64) -.endif - -.endm - -.macro KERNEL2x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 3 - xxspltw vs9, vs36, 2 - xxspltw vs10, vs36, 1 - xxspltw vs11, vs36, 0 - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs16, DISP8(\Index, 16+\OffsetA)(\AREG) - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - xvmaddasp vs4, vs16, vs10 - xvmaddasp vs5, vs16, vs11 - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP8(\Index,32) -.endif - -.endm - - -.macro SAVE2x4 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxv vs26, 0(T1) - -#endif - /*aggregate vectors*/ - xvaddsp vs0,vs0,vs4 - xvaddsp vs1,vs1,vs5 -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs26, vs1, alpha_r -#else - 
xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs26, vs1, alpha_r -#endif - - stxv vs16, 0(CO) - stxv vs26, 0(T1) - - addi CO,CO,16 - -.endm - - -/* M=2 N=2 we will have inner pemrute action before permute was revrsing 3,2,1,0 not iw 2ill inner reverse 1,0,3,2 */ -.macro SWITCH_PERMUTE_INNER - xxpermdi permute_mask, permute_mask, permute_mask,2 -.endm - -.macro Zero2x2 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - SWITCH_PERMUTE_INNER -.endm - -.macro KERNEL2x2 - KERNEL2x2_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x2_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x2_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x2_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxperm vs9, vs36, permute_mask - lxsd v5, DISP2(\Index, 0+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs37, vs36 - xvmulsp vs1, vs37, vs9 - -.else - xvmaddasp vs0, vs37, vs36 - xvmaddasp vs1, vs37, vs9 - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP2(\Index,8) - -.endm - - - - -.macro KERNEL2x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs16, DISP8(\Index,16+\OffsetA)(\AREG) - - - xxperm vs9, vs8, permute_mask - xxperm vs11, vs10, permute_mask - - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - xvmaddasp vs0, vs16, vs10 - xvmaddasp vs1, vs16, vs11 - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP8(\Index,32) -.endif - -.endm - -.macro KERNEL2x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP4(\Index, 0+\OffsetB)(\BREG) - lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) - - - xxperm vs9, vs8, permute_mask - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs26, vs9 - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP4(\Index,16) -.endif -.endm - - -.macro SAVE2x2 - -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxsd v5 , 0(T1) - -#endif - /*aggregate vectors*/ - xxpermdi vs4,vs0,vs0,2 - xxpermdi vs5,vs1,vs1,2 - xvaddsp vs0,vs0,vs4 - xvaddsp vs1,vs1,vs5 - /* */ - /* lets correct the order to 00 10 and 10 ,11 from {00,11} {01,10} */ - xxperm vs1,vs1, permute_mask - - - xxmrghw vs2 ,vs1,vs0 - xxpermdi vs2,vs2,vs2,2 - xxmrghw vs3 ,vs0,vs1 -#if defined(TRMMKERNEL) - xvmulsp vs36, vs2, alpha_r - xvmulsp vs37, vs3, alpha_r -#else - xvmaddasp vs36, vs2, alpha_r - xvmaddasp vs37, vs3, alpha_r -#endif - /**** store last two words*/ - - - stxsd v4, 0(CO) - stxsd v5, 0(T1) - - addi CO,CO,8 - -.endm - -/*--------------------------- M=1 N=2 */ -.macro Zero2x1 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2,vs2,vs2 - xxlxor vs3,vs3,vs3 -.endm - -.macro KERNEL2x1 - KERNEL2x1_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL2x1_4 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL2x1_2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - /* - we will calculate 1 alone then will add it to batched ones - */ -.macro KERNEL2x1_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v3, DISP2(\Index, 0+\OffsetB)(\BREG) - lxssp v4, DISP2(\Index, 4+\OffsetB)(\BREG) - lxssp v5, DISP1(\Index, 0+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs2, vs37, vs35 - xvmulsp vs3, vs37, vs36 - -.else - xsmaddadp vs2, 
vs37, vs35 - xsmaddadp vs3, vs37, vs36 - .endif - - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP1(\Index,4) - -.endm - - - - -.macro KERNEL2x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) - lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) - - lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) - - xxmrglw vs5, vs26,vs26 - xxmrghw vs6, vs26,vs26 - - xvmaddasp vs0, vs8, vs5 - xvmaddasp vs1, vs10, vs6 - - -.if \IsLast==1 - addi \BREG, \BREG, DISP8(\Index,32) - addi \AREG, \AREG, DISP4(\Index,16) -.endif - -.endm - -.macro KERNEL2x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxssp v3, DISP4(\Index, 0+\OffsetB)(\BREG) - lxssp v4, DISP4(\Index, 4+\OffsetB)(\BREG) - lxssp v7, DISP4(\Index, 8+\OffsetB)(\BREG) - lxssp v8, DISP4(\Index, 12+\OffsetB)(\BREG) - lxssp v5, DISP2(\Index, 0+\OffsetA)(\AREG) - lxssp v6, DISP2(\Index, 4+\OffsetA)(\AREG) - - - xsmaddadp vs2, vs37, vs35 - xsmaddadp vs3, vs37, vs36 - - xsmaddadp vs2, vs38, vs39 - xsmaddadp vs3, vs38, vs40 - - - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP2(\Index,8) -.endm - - -.macro SAVE2x1 - -#ifndef TRMMKERNEL - lxssp v4 , 0(CO) -#endif - add T1, CO, LDC -#ifndef TRMMKERNEL - lxssp v5 , 0(T1) - -#endif - - /*convert alpha_r for multiply*/ - xscvspdp vs16,alpha_r - - /*aggregate vectors 2x2_4 */ - xxpermdi vs4,vs0,vs0,2 - xxpermdi vs5,vs1,vs1,2 - xvaddsp vs0,vs0,vs4 - xvaddsp vs1,vs1,vs5 - xvaddsp vs0,vs0,vs1 -/*aggregate vectors 2x1_2 and 2x1_1 into 2x2_4*/ - xscvspdp vs5, vs0 - xxspltw vs6, vs0, 1 - xscvspdp vs6,vs6 - xsadddp vs2,vs2,vs6 - xsadddp vs3,vs3,vs5 - - /**** store last two words*/ -#if defined(TRMMKERNEL) - xsmuldp vs36,vs2, vs16 - xsmuldp vs37,vs3, vs16 - -#else - xsmaddadp vs36,vs2, vs16 - xsmaddadp vs37,vs3, vs16 -#endif - - stxssp v4, 0(CO) - stxssp v5, 0(T1) - - addi CO,CO,4 - -.endm - - - -/****************************N=1 section*****************/ - -.macro KERNEL1x16_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero1x16 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 -.endm - -.macro KERNEL1x16 - KERNEL1x16_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL1x16_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x16_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) - xscvdpspn vs36,vs36 - xxspltw vs8, vs36, 0 - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) - lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - xvmulsp vs2, vs28, vs8 - xvmulsp vs3, vs29, vs8 - - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - .endif - - addi \BREG, \BREG, DISP1(\Index,4) - addi \AREG, \AREG, DISP16(\Index,64) - -.endm - - - - -.macro KERNEL1x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) - - lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) - lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) - - lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) - lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) - lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) - - xxspltw vs8, vs38, 3 - xxspltw vs9, 
vs38, 2 - - lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) - lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) - lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) - lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) - - lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) - lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) - lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) - lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) - - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - - xvmaddasp vs0, vs16, vs9 - xvmaddasp vs1, vs17, vs9 - xvmaddasp vs2, vs18, vs9 - xvmaddasp vs3, vs19, vs9 - - - xvmaddasp vs0, vs30, vs10 - xvmaddasp vs1, vs31, vs10 - xvmaddasp vs2, vs32, vs10 - xvmaddasp vs3, vs33, vs10 - - - xvmaddasp vs0, vs34, vs11 - xvmaddasp vs1, vs35, vs11 - xvmaddasp vs2, vs36, vs11 - xvmaddasp vs3, vs37, vs11 - - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP64(\Index,256) -.endif - -.endm - -.macro KERNEL1x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) - lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) - lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) - lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - xvmaddasp vs2, vs28, vs8 - xvmaddasp vs3, vs29, vs8 - - - xvmaddasp vs0, vs16, vs9 - xvmaddasp vs1, vs17, vs9 - xvmaddasp vs2, vs18, vs9 - xvmaddasp vs3, vs19, vs9 - - -.if \IsLast==1 - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP32(\Index,128) -.endif - -.endm - - -.macro SAVE1x16 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) - lxv vs17, 16(CO) - lxv vs18, 32(CO) - lxv vs19, 48(CO) -#endif - - -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs17, vs1, alpha_r - xvmulsp vs18, vs2, alpha_r - xvmulsp vs19, vs3, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs17, vs1, alpha_r - xvmaddasp vs18, vs2, alpha_r - xvmaddasp vs19, vs3, alpha_r -#endif - stxv vs16, 0(CO) - stxv vs17, 16(CO) - stxv vs18, 32(CO) - stxv vs19, 48(CO) - - addi CO,CO,64 - -.endm - -/* M=8 N=1 */ - -.macro KERNEL1x8_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero1x8 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 -.endm - -.macro KERNEL1x8 - KERNEL1x8_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL1x8_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x8_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) - xscvdpspn vs36,vs36 - xxspltw vs8, vs36, 0 - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 - xvmulsp vs1, vs27, vs8 - - -.else - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - .endif - - addi \BREG, \BREG, DISP1(\Index,4) - addi \AREG, \AREG, DISP8(\Index,32) - -.endm - - - - -.macro KERNEL1x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) - - lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) - lxv vs27, 
DISP32(\Index,16+\OffsetA)(\AREG) - - lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - - lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) - lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) - - lxv vs34, DISP32(\Index,64+ 32+ 0+\OffsetA)(\AREG) - lxv vs35, DISP32(\Index,64+ 32+ 16+\OffsetA)(\AREG) - - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - - xvmaddasp vs2, vs16, vs9 - xvmaddasp vs3, vs17, vs9 - - - xvmaddasp vs0, vs30, vs10 - xvmaddasp vs1, vs31, vs10 - - - xvmaddasp vs2, vs34, vs11 - xvmaddasp vs3, vs35, vs11 - - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP32(\Index,128) -.endif - -.endm - -.macro KERNEL1x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - lxv vs16, DISP16(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs17, DISP16(\Index,32+ 16+\OffsetA)(\AREG) - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs8 - - - xvmaddasp vs2, vs16, vs9 - xvmaddasp vs3, vs17, vs9 - - -.if \IsLast==1 - addi \BREG, \BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP16(\Index,64) -.endif - -.endm - - -.macro SAVE1x8 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) - lxv vs17, 16(CO) -#endif - /* aggregate vs0 vs2 and vs1 vs3*/ - xvaddsp vs0,vs0,vs2 - xvaddsp vs1,vs1,vs3 -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r - xvmulsp vs17, vs1, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r - xvmaddasp vs17, vs1, alpha_r -#endif - stxv vs16, 0(CO) - stxv vs17, 16(CO) - - addi CO,CO,32 - -.endm -/*M=4*/ - -.macro KERNEL1x4_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - - -.macro Zero1x4 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2, vs2 - xxlxor vs3, vs3, vs3 -.endm - -.macro KERNEL1x4 - KERNEL1x4_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL1x4_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x4_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) - xscvdpspn vs36,vs36 - xxspltw vs8, vs36, 0 - lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) - - -.if \First==1 - xvmulsp vs0, vs26, vs8 -.else - xvmaddasp vs0, vs26, vs8 - - .endif - - addi \BREG, \BREG, DISP1(\Index,4) - addi \AREG, \AREG, DISP4(\Index,16) - -.endm - - - - -.macro KERNEL1x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) - - lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) - - - xxspltw vs8, vs38, 3 - xxspltw vs9, vs38, 2 - - lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) - lxv vs31, DISP16(\Index,32+ 16+\OffsetA)(\AREG) - - - xxspltw vs10, vs38, 1 - xxspltw vs11, vs38, 0 - - - xvmaddasp vs0, vs26, vs8 - - xvmaddasp vs1, vs27, vs9 - - xvmaddasp vs2, vs30, vs10 - - - xvmaddasp vs3, vs31, vs11 - - - - -.if \IsLast==1 - addi \BREG, \BREG, DISP4(\Index,16) - addi \AREG, \AREG, DISP16(\Index,64) -.endif - -.endm - -.macro KERNEL1x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) - xxspltw vs8, vs36, 1 - xxspltw vs9, vs36, 0 - lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) - lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) - - - xvmaddasp vs0, vs26, vs8 - xvmaddasp vs1, vs27, vs9 - - -.if \IsLast==1 - addi \BREG, 
\BREG, DISP2(\Index,8) - addi \AREG, \AREG, DISP8(\Index,32) -.endif - -.endm - - -.macro SAVE1x4 - -#ifndef TRMMKERNEL - lxv vs16, 0(CO) -#endif - /* aggregate */ - xvaddsp vs0,vs0,vs2 - xvaddsp vs1,vs1,vs3 - xvaddsp vs0,vs1,vs0 -#if defined(TRMMKERNEL) - xvmulsp vs16, vs0, alpha_r -#else - xvmaddasp vs16, vs0, alpha_r -#endif - stxv vs16, 0(CO) - - addi CO,CO,16 - -.endm - -/* M=2 N=1*/ -.macro Zero1x2 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2,vs2,vs2 - xxlxor vs3,vs3,vs3 -.endm - -.macro KERNEL1x2 - KERNEL1x2_1 AO,BO, 0, 0,0,0 -.endm -.macro KERNEL1x2_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x2_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - /* - we will calculate 1 alone then will add it to batched ones - */ -.macro KERNEL1x2_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v3, DISP2(\Index, 0+\OffsetB)(\AREG) - lxssp v4, DISP2(\Index, 4+\OffsetB)(\AREG) - lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) - - -.if \First==1 - xvmuldp vs2, vs37, vs35 - xvmuldp vs3, vs37, vs36 - -.else - xsmaddadp vs2, vs37, vs35 - xsmaddadp vs3, vs37, vs36 - .endif - - addi \AREG, \AREG, DISP2(\Index,8) - addi \BREG, \BREG, DISP1(\Index,4) - -.endm - - - - -.macro KERNEL1x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) - lxv vs10, DISP8(\Index, 16+\OffsetB)(\AREG) - - lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) - - xxmrglw vs5, vs26,vs26 - xxmrghw vs6, vs26,vs26 - - xvmaddasp vs0, vs8, vs5 - xvmaddasp vs1, vs10, vs6 - - -.if \IsLast==1 - addi \AREG, \AREG, DISP8(\Index,32) - addi \BREG, \BREG, DISP4(\Index,16) -.endif - -.endm - -.macro KERNEL1x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxssp v3, DISP4(\Index, 0+\OffsetB)(\AREG) - lxssp v4, DISP4(\Index, 4+\OffsetB)(\AREG) - lxssp v7, DISP4(\Index, 8+\OffsetB)(\AREG) - lxssp v8, DISP4(\Index, 12+\OffsetB)(\AREG) - lxssp v5, DISP2(\Index, 0+\OffsetA)(\BREG) - lxssp v6, DISP2(\Index, 4+\OffsetA)(\BREG) - - - xsmaddadp vs2, vs37, vs35 - xsmaddadp vs3, vs37, vs36 - - xsmaddadp vs2, vs38, vs39 - xsmaddadp vs3, vs38, vs40 - - - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP2(\Index,8) -.endm - - -.macro SAVE1x2 - -#ifndef TRMMKERNEL - lxssp v4 , 0(CO) - lxssp v5 , 4(CO) - -#endif - - /*convert alpha_r for multiply*/ - xscvspdp vs16,alpha_r - - /*aggregate vectors 1x2_4 */ - xxpermdi vs4,vs0,vs0,2 - xxpermdi vs5,vs1,vs1,2 - xvaddsp vs0,vs0,vs4 - xvaddsp vs1,vs1,vs5 - xvaddsp vs0,vs0,vs1 -/*aggregate vectors 1x1_2 and 1x1_1 into 1x2_4*/ - xscvspdp vs5, vs0 - xxspltw vs6, vs0, 1 - xscvspdp vs6,vs6 - xsadddp vs2,vs2,vs6 - xsadddp vs3,vs3,vs5 - - /**** store last two words*/ -#if defined(TRMMKERNEL) - xsmuldp vs36,vs2, vs16 - xsmuldp vs37,vs3, vs16 - -#else - xsmaddadp vs36,vs2, vs16 - xsmaddadp vs37,vs3, vs16 -#endif - - stxssp v4, 0(CO) - stxssp v5, 4(CO) - - addi CO,CO,8 - -.endm -/*///////////////// N=1 M=1 //////////////////*/ -.macro Zero1x1 - xxlxor vs0, vs0, vs0 - xxlxor vs1, vs1, vs1 - xxlxor vs2, vs2,vs2 - xxlxor vs3,vs3,vs3 - xxlxor vs4,vs4,vs4 -.endm - -.macro KERNEL1x1 - KERNEL1x1_1 AO,BO, 1, 0,0,0 -.endm - -.macro KERNEL1x1_16 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_I_16 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x1_8 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_I_8 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro KERNEL1x1_4 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - -.macro 
KERNEL1x1_2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast -.endm - /* - we will calculate 1 alone ( FIRST==1 to zero vs4) - */ -.macro KERNEL1x1_1 AREG,BREG,First,OffsetA,OffsetB,Index - - - lxssp v3, DISP1(\Index, 0+\OffsetB)(\AREG) - lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) - - -.if \First==1 - xvmuldp vs4, vs37, vs35 - -.else - xsmaddadp vs4, vs37, vs35 - .endif - - addi \AREG, \AREG, DISP1(\Index,4) - addi \BREG, \BREG, DISP1(\Index,4) - -.endm - - -.macro KERNEL1x1_I_16 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP16(\Index, 0+\OffsetB)(\AREG) - lxv vs9, DISP16(\Index, 16+\OffsetB)(\AREG) - lxv vs10, DISP16(\Index, 32+0+\OffsetB)(\AREG) - lxv vs11, DISP16(\Index, 32+ 16+\OffsetB)(\AREG) - lxv vs26, DISP16(\Index, 0+\OffsetA)(\BREG) - lxv vs16, DISP16(\Index, 16+\OffsetA)(\BREG) - lxv vs17, DISP16(\Index, 32+0+\OffsetA)(\BREG) - lxv vs18, DISP16(\Index, 32+16+\OffsetA)(\BREG) - xvmaddasp vs0, vs8, vs26 - xvmaddasp vs1, vs9, vs16 - xvmaddasp vs2, vs10, vs17 - xvmaddasp vs3, vs11, vs18 -.if \IsLast==1 - addi \AREG, \AREG, DISP16(\Index,64) - addi \BREG, \BREG, DISP16(\Index,64) -.endif - -.endm - -.macro KERNEL1x1_I_8 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) - lxv vs9, DISP8(\Index, 16+\OffsetB)(\AREG) - lxv vs26, DISP8(\Index, 0+\OffsetA)(\BREG) - lxv vs16, DISP8(\Index, 16+\OffsetA)(\BREG) - xvmaddasp vs0, vs8, vs26 - xvmaddasp vs1, vs9, vs16 - -.if \IsLast==1 - addi \AREG, \AREG, DISP8(\Index,32) - addi \BREG, \BREG, DISP8(\Index,32) -.endif - -.endm - - -.macro KERNEL1x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxv vs8, DISP4(\Index, 0+\OffsetB)(\AREG) - lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) - - xvmaddasp vs0, vs8, vs26 - - -.if \IsLast==1 - addi \AREG, \AREG, DISP4(\Index,16) - addi \BREG, \BREG, DISP4(\Index,16) -.endif - -.endm - -.macro KERNEL1x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast - - lxsd v4, DISP2(\Index, 0+\OffsetB)(\AREG) - lxsd v5, DISP2(\Index, 0+\OffsetA)(\BREG) - - xvmaddasp vs0, vs36, vs37 - - addi \AREG, \AREG, DISP2(\Index,8) - addi \BREG, \BREG, DISP2(\Index,8) -.endm - - -.macro SAVE1x1 - -#ifndef TRMMKERNEL - lxssp v4 , 0(CO) - -#endif - - /*convert alpha_r for multiply*/ - xscvspdp vs16,alpha_r - - /*aggregate vectors */ - xvaddsp vs0,vs0,vs1 - xvaddsp vs2,vs2,vs3 - xvaddsp vs0,vs0,vs2 - - xxpermdi vs7,vs0,vs0,2 - xvaddsp vs0,vs0,vs7 -/*aggregate vectors 1x1_2 and 1x1_1 into 1x1_4*/ - xscvspdp vs5, vs0 - xxspltw vs6, vs0, 1 - xscvspdp vs6,vs6 - xsadddp vs7,vs5,vs6 - xsadddp vs4,vs4,vs7 - - /**** store last two words*/ -#if defined(TRMMKERNEL) - xsmuldp vs36,vs4, vs16 - -#else - xsmaddadp vs36,vs4, vs16 -#endif - - stxssp v4, 0(CO) - - addi CO,CO,4 - -.endm - - - - -/****************************TRMM POINTER REFRESH MACROSES*************************/ - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 4 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 3 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 2 - .endif -.endm - -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*16; -// ptrbb = bb + off*2; -// #endif -*/ -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - - #else - /* - // ptrba =ptrba+ off*C_A; - // 
ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+16; // number of values in A -// #else -// temp = off+2; // number of values in B -// #endif -*/ -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - /* temp = bk-off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif - -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 16; // number of values in A -// #else -// temp -= 2; // number of values in B -// #endif -// ptrba += temp*16; -// ptrbb += temp*2; -// #endif - -// #ifdef LEFT -// off += 16; // number of values in A -// #endif -*/ - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - - #endif - - #ifdef LEFT - /*off += 8; // number of values in A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define unit_size 4 +#define DISP64(ind,disp) (ind*unit_size*64+disp) +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) + +/********************************************************************************************** +* Macros for N=8 and M=16 +**********************************************************************************************/ + + + +.macro KERNEL8x16_L1_L4 Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero8X16 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + +.macro LOAD8x16 OffsetA,OffsetB + + lxv vs24, (\OffsetB+0)(BO) + lxv vs28, (\OffsetB+16)(BO) + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + lxv vs0, (\OffsetA+0)(AO) + lxv vs1, (\OffsetA+16)(AO) + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + lxv vs2, (\OffsetA+32)(AO) + lxv vs3, (\OffsetA+48)(AO) + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endm + +.macro END8x16_NORMAL + END8x16 0, AO, BO, 64,32 +.endm + +.macro END8x16_WITHOUT_ADD + END8x16 0, AO,BO,0,0 +.endm + +.macro END8x16 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + xvmulsp vs50, vs2,vs28 + xvmulsp vs51, vs3,vs28 + + xvmulsp vs52, vs0,vs29 + 
xvmulsp vs53, vs1,vs29 + xvmulsp vs54, vs2,vs29 + xvmulsp vs55, vs3,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + xvmulsp vs58, vs2,vs30 + xvmulsp vs59, vs3,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + xvmulsp vs62, vs2,vs31 + xvmulsp vs63, vs3,vs31 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 + +.endif +.endm + +.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + +KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0 +KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete + +.endm + +.macro KERNEL8x16 First + + LOAD8x16 0,0 + END8x16 \First, AO, BO, 64,32 +.endm + +.macro LOAD8x16_2 + LOAD8x16_2O AO,BO, 0,0 +.endm + +.macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB + lxv vs8, (\OffsetB)(\BREG) + lxv vs12, (16+\OffsetB)(\BREG) + lxv vs24, (32+\OffsetB)(\BREG) + lxv vs28, (32+16+\OffsetB)(\BREG) + lxv vs4, (0+\OffsetA)(\AREG) + lxv vs5, (16+\OffsetA)(\AREG) + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + lxv vs6, (32+\OffsetA)(\AREG) + lxv vs7, (48+\OffsetA)(\AREG) + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + lxv vs0, (64+\OffsetA)(\AREG) + lxv vs1, (64+16+\OffsetA)(\AREG) + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + lxv vs2, (64+32+\OffsetA)(\AREG) + lxv vs3, (64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endm + +.macro END8x16_2 + /*for load2 offset will be 128 and 64*/ + KERNEL8x16_2 AO,BO, 128,64,0 ,1,1 +.endm + + + +.macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast + KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.if \Complete==0 + lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + xvmaddasp vs50, vs6,vs12 + xvmaddasp vs51, vs7,vs12 +.if \Complete==0 + lxv vs8, DISP16(\Index,\OffsetB)(\BREG) + lxv vs12, 
DISP16(\Index,16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + xvmaddasp vs58, vs6,vs14 + xvmaddasp vs59, vs7,vs14 +.if \Complete==0 + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask +.endif + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + xvmaddasp vs54, vs6,vs13 + xvmaddasp vs55, vs7,vs13 +.if \Complete==0 + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.endif + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + xvmaddasp vs62, vs6,vs15 + xvmaddasp vs63, vs7,vs15 +.if \Complete==0 + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 +.endif + +.if \Complete==0 + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) +.endif + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 +.if \Complete==0 + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) +.endif + + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + xvmaddasp vs50, vs2,vs28 + xvmaddasp vs51, vs3,vs28 +.if \Complete==0 + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + xvmaddasp vs58, vs2,vs30 + xvmaddasp vs59, vs3,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs54, vs2,vs29 + xvmaddasp vs55, vs3,vs29 +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + xvmaddasp vs62, vs2,vs31 + xvmaddasp vs63, vs3,vs31 +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 +.endif +.if \Complete==0 + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP16(\Index,\OffsetB) + addi \AREG, \AREG, DISP32(\Index,\OffsetA) + +.else + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) + +.endif +.endif + + +.endm + + +.macro SAVE8x16 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + add T4, T2, T10 + add T5, T3, T10 + + add T6, T4, T10 + add T7, T5, T10 + + + + /* permute to restore butterfly rank 1 updateto normal promoted one */ + /* permute 16 vs8 MEM(CO) vs9 MEM(CO+LDC) vs10 MEM(CO+2*LDC) vs11 MEM(CO+3*LDC) */ + /* permute 16 vs12 MEM(16+CO) vs13 MEM(16+CO+LDC) vs14 MEM(16+CO+2*LDC) vs15 MEM(16+CO+3*LDC) */ + /* permute 16 vs16 MEM(32+CO) vs17 MEM(32+CO+LDC) vs18 MEM(32+CO+2*LDC) vs19 MEM(32+CO+3*LDC) */ + /* permute 16 vs24 MEM(32+CO) vs25 MEM(32+CO+LDC) vs26 MEM(32+CO+2*LDC) vs27 MEM(32+CO+3*LDC) */ + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 +#ifndef TRMMKERNEL + lxv vs32, 0(CO) + lxv vs33, 16(CO) +#endif + xxmrglw vs16, vs34, vs46 + xxmrglw vs18, vs38, vs42 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxmrghw vs4, vs38, vs42 + xxmrghw vs5, 
vs34, vs46 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs35, vs47 + xxmrglw vs26, vs39, vs43 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + + xxmrghw vs30, vs39, vs43 + xxmrghw vs31, vs35, vs47 +#ifndef TRMMKERNEL + lxv vs34, 32(CO) + lxv vs35, 48(CO) +#endif + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 +#ifndef TRMMKERNEL + lxv vs36, 0(T1) + lxv vs37, 16(T1) +#endif + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL + lxv vs38, 32(T1) + lxv vs39, 48(T1) +#endif + + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + + + +#ifndef TRMMKERNEL + lxv vs40, 0(T2) + lxv vs41, 16(T2) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 +#ifndef TRMMKERNEL + lxv vs42, 32(T2) + lxv vs43, 48(T2) +#endif + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 +#ifndef TRMMKERNEL + lxv vs44, 0(T3) + lxv vs45, 16(T3) +#endif + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 +#ifndef TRMMKERNEL + lxv vs46, 32(T3) + lxv vs47, 48(T3) +#endif + + + + + + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r +#endif + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + + + stxv vs32, 0(CO) + stxv vs33, 16(CO) +#ifdef TRMMKERNEL + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r +#else + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r +#endif + + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + + + stxv vs34, 32(CO) + stxv vs35, 48(CO) +#ifdef TRMMKERNEL + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r +#else + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r +#endif + stxv vs36, 0(T1) + stxv vs37, 16(T1) +#ifdef TRMMKERNEL + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + stxv vs38, 32(T1) + stxv vs39, 48(T1) + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r +#else + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r +#endif + + stxv vs40, 0(T2) + stxv vs41, 16(T2) +#ifdef TRMMKERNEL + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r +#else + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r +#endif + stxv vs42, 32(T2) + stxv vs43, 48(T2) +#ifdef TRMMKERNEL + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r +#else + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r +#endif + stxv vs44, 0(T3) + stxv vs45, 16(T3) +#ifdef TRMMKERNEL + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r +#endif + stxv vs46, 32(T3) + stxv vs47, 48(T3) + + /*****the same with the second 8X8 ****/ + #ifndef TRMMKERNEL + lxv vs32, 0(T4) + lxv vs33, 16(T4) +#endif + xxmrglw vs8, vs48, vs60 + xxmrglw vs10, vs52, vs56 +#ifndef TRMMKERNEL + lxv vs34, 32(T4) + lxv vs35, 48(T4) +#endif + xxmrghw vs1, vs48, vs60 + xxmrghw vs0, vs52, vs56 +#ifndef TRMMKERNEL + lxv vs36, 0(T5) + lxv vs37, 16(T5) +#endif + xxmrglw vs12, vs49, vs61 + xxmrglw vs14, vs53, vs57 +#ifndef TRMMKERNEL + lxv vs38,32(T5) + lxv vs39, 48(T5) +#endif + + xxmrghw vs2, vs53, vs57 + xxmrghw vs3, vs49, vs61 +#ifndef TRMMKERNEL + lxv vs40, 0(T6) + lxv vs41, 16(T6) +#endif + xxmrglw vs16, vs50, vs62 + xxmrglw vs18, vs54, vs58 
+#ifndef TRMMKERNEL + lxv vs42, 32(T6) + lxv vs43, 48(T6) +#endif + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + xxmrghw vs4, vs54, vs58 + xxmrghw vs5, vs50, vs62 +#ifndef TRMMKERNEL + lxv vs44, 0(T7) + lxv vs45, 16(T7) +#endif + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs51, vs63 + xxmrglw vs26, vs55, vs59 +#ifndef TRMMKERNEL + lxv vs46, 32(T7) + lxv vs47, 48(T7) +#endif + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + xxmrghw vs30, vs55, vs59 + xxmrghw vs31, vs51, vs63 + + + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + #ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r +#endif + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + stxv vs32, 0(T4) + stxv vs33, 16(T4) + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + +#ifdef TRMMKERNEL + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r +#else + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r +#endif + stxv vs34, 32(T4) + stxv vs35, 48(T4) + +#ifdef TRMMKERNEL + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r +#else + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r +#endif + stxv vs36, 0(T5) + stxv vs37, 16(T5) + +#ifdef TRMMKERNEL + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + + + + stxv vs38, 32(T5) + stxv vs39, 48(T5) + + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r +#else + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r +#endif + stxv vs40, 0(T6) + stxv vs41, 16(T6) +#ifdef TRMMKERNEL + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r +#else + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r +#endif + stxv vs42, 32(T6) + stxv vs43, 48(T6) +#ifdef TRMMKERNEL + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r +#else + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r +#endif + + stxv vs44, 0(T7) + stxv vs45, 16(T7) +#ifdef TRMMKERNEL + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r +#endif + + stxv vs46, 32(T7) + stxv vs47, 48(T7) + + + addi CO,CO,64 + + +.endm + + + +/********************************************************************************************** +* Macros for N=8 and M=8 +**********************************************************************************************/ + +.macro LOAD8x8_1 + LOAD8x8 1 +.endm + +.macro LOAD8x8_0 + LOAD8x8 0 +.endm + +.macro KERNEL8x8_L1_L4 Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro 
KERNEL8x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro END8x8_NORMAL + END8x8 0, AO, BO, 32,32 +.endm + +.macro Zero8X8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + +.endm + +.macro LOAD8x8 Zero + + lxv vs24, 0(BO) + lxv vs28, 16(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 +.endif +.endm + + +.macro END8x8 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.endif +.endm + +.macro KERNEL8x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + lxv vs24, 
DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG) + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + + lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) + + + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG) + lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG) + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + + + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG) + + + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 +.if \Complete==0 + lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG) +.endif + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 +.if \Complete==0 + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask +.endif + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + + +.if \Complete==0 + lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP32(\Index,128) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.endm + +.macro KERNEL8x8 First + + LOAD8x8 0 + END8x8 \First, AO, BO, 32,32 +.endm + +.macro KERNEL8x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxperm vs14, vs12, permute_mask + xxpermdi vs9, vs8, vs8,2 + xxpermdi vs13, vs12, vs12,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, 
vs1,vs25 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + +.endif + + xxpermdi vs11, vs10, vs10,2 + xxpermdi vs15, vs14, vs14,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + xvmulsp vs48, vs0,vs28 + xvmulsp vs49, vs1,vs28 + + xvmulsp vs52, vs0,vs29 + xvmulsp vs53, vs1,vs29 + + xvmulsp vs56, vs0,vs30 + xvmulsp vs57, vs1,vs30 + + xvmulsp vs60, vs0,vs31 + xvmulsp vs61, vs1,vs31 + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + xvmaddasp vs48, vs0,vs28 + xvmaddasp vs49, vs1,vs28 + + xvmaddasp vs52, vs0,vs29 + xvmaddasp vs53, vs1,vs29 + + xvmaddasp vs56, vs0,vs30 + xvmaddasp vs57, vs1,vs30 + + xvmaddasp vs60, vs0,vs31 + xvmaddasp vs61, vs1,vs31 + +.endif +.if \Complete==0 + lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG) + + lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxperm vs30, vs28, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs29, vs28, vs28,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) + +.else + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP16(\Index,64) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + xxpermdi vs31, vs30, vs30,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + + xvmulsp vs48, vs4,vs12 + xvmulsp vs49, vs5,vs12 + + xvmulsp vs52, vs4,vs13 + xvmulsp vs53, vs5,vs13 + + xvmulsp vs56, vs4,vs14 + xvmulsp vs57, vs5,vs14 + + xvmulsp vs60, vs4,vs15 + xvmulsp vs61, vs5,vs15 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + xvmaddasp vs48, vs4,vs12 + xvmaddasp vs49, vs5,vs12 + + xvmaddasp vs52, vs4,vs13 + xvmaddasp vs53, vs5,vs13 + + xvmaddasp vs56, vs4,vs14 + xvmaddasp vs57, vs5,vs14 + + xvmaddasp vs60, vs4,vs15 + xvmaddasp vs61, vs5,vs15 + +.endif + +.endm + + +.macro SAVE8x8 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + add T4, T2, T10 + add T5, T3, T10 + + add T6, T4, T10 + add T7, T5, T10 + +#ifndef TRMMKERNEL + lxv vs34, 0(CO) + lxv vs35, 16(CO) + lxv vs38, 0(T1) + lxv vs39, 16(T1) + lxv vs42, 0(T2) + lxv vs43, 16(T2) + lxv vs46, 0(T3) + lxv vs47, 16(T3) + + lxv vs50, 0(T4) + lxv vs51, 16(T4) + lxv vs54, 0(T5) + lxv vs55, 16(T5) + lxv vs58, 0(T6) + lxv vs59, 16(T6) + lxv vs62, 0(T7) + lxv vs63, 16(T7) +#endif + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + 
xxperm vs15, vs3, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs34, vs8, alpha_r + xvmulsp vs35, vs12, alpha_r + xvmulsp vs38, vs9, alpha_r + xvmulsp vs39, vs13, alpha_r + xvmulsp vs42, vs10, alpha_r + xvmulsp vs43, vs14, alpha_r + xvmulsp vs46, vs11, alpha_r + xvmulsp vs47, vs15, alpha_r +#else + xvmaddasp vs34, vs8, alpha_r + xvmaddasp vs35, vs12, alpha_r + xvmaddasp vs38, vs9, alpha_r + xvmaddasp vs39, vs13, alpha_r + xvmaddasp vs42, vs10, alpha_r + xvmaddasp vs43, vs14, alpha_r + xvmaddasp vs46, vs11, alpha_r + xvmaddasp vs47, vs15, alpha_r +#endif + + + xxmrglw vs8, vs48, vs60 + xxmrglw vs10, vs52, vs56 + + xxmrghw vs1, vs48, vs60 + xxmrghw vs0, vs52, vs56 + stxv vs34, 0(CO) + stxv vs35, 16(CO) + xxmrglw vs12, vs49, vs61 + xxmrglw vs14, vs53, vs57 + stxv vs38, 0(T1) + stxv vs39, 16(T1) + xxmrghw vs2, vs53, vs57 + xxmrghw vs3, vs49, vs61 + stxv vs42, 0(T2) + stxv vs43, 16(T2) + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + stxv vs46, 0(T3) + stxv vs47, 16(T3) + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + + + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + #ifdef TRMMKERNEL + xvmulsp vs50, vs8, alpha_r + xvmulsp vs51, vs12, alpha_r + xvmulsp vs54, vs9, alpha_r + xvmulsp vs55, vs13, alpha_r + xvmulsp vs58, vs10, alpha_r + xvmulsp vs59, vs14, alpha_r + xvmulsp vs62, vs11, alpha_r + xvmulsp vs63, vs15, alpha_r +#else + xvmaddasp vs50, vs8, alpha_r + xvmaddasp vs51, vs12, alpha_r + xvmaddasp vs54, vs9, alpha_r + xvmaddasp vs55, vs13, alpha_r + xvmaddasp vs58, vs10, alpha_r + xvmaddasp vs59, vs14, alpha_r + xvmaddasp vs62, vs11, alpha_r + xvmaddasp vs63, vs15, alpha_r +#endif + + stxv vs50, 0(T4) + stxv vs51, 16(T4) + stxv vs54, 0(T5) + stxv vs55, 16(T5) + stxv vs58, 0(T6) + stxv vs59, 16(T6) + stxv vs62, 0(T7) + stxv vs63, 16(T7) + + addi CO,CO,32 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=4 +**********************************************************************************************/ + +.macro LOAD8x4_1 + LOAD8x4 1 +.endm + +.macro LOAD8x4_0 + LOAD8x4 0 +.endm + +.macro KERNEL8x4_L1_L4 Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL8x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL8x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL8x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero8X4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + +.endm + +.macro LOAD8x4 Zero + + lxv vs0, 0(AO) + lxv vs24, 0(BO) + 
lxv vs25, 16(BO) + + + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 +.endif +.endm + +.macro END8x4_NORMAL + END8x4 0, AO, BO, 16,32 +.endm + +.macro END8x4 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + + xvmulsp vs48, vs25, vs0 + xvmulsp vs49, vs25, vs1 + xvmulsp vs50, vs25, vs2 + xvmulsp vs51, vs25, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + +.endif +.endm + +.macro KERNEL8x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + + lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP32(\Index, 32+\OffsetB)(\BREG) + lxv vs25, DISP32(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 + + + lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs26, DISP32(\Index, 64+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index, 80+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 + +.if \Complete==0 + + lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) + lxv vs24, DISP32(\Index, 96+\OffsetB)(\BREG) + lxv vs25, DISP32(\Index, 96+16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) + addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB) + +.else + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP32(\Index,128) + +.endif +.endif + + +.endm + +.macro KERNEL8x4 First + LOAD8x4 0 + END8x4 \First, AO, BO, 16,32 +.endm + +.macro KERNEL8x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) + + 
xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + + xvmulsp vs48, vs25, vs0 + xvmulsp vs49, vs25, vs1 + xvmulsp vs50, vs25, vs2 + xvmulsp vs51, vs25, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + xvmaddasp vs48, vs25, vs0 + xvmaddasp vs49, vs25, vs1 + xvmaddasp vs50, vs25, vs2 + xvmaddasp vs51, vs25, vs3 +.endif + +.if \Complete==0 + + lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 32+\OffsetB)(\BREG) + lxv vs25, DISP16(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + +.if \First==1 + xvmulsp vs32, vs26, vs4 + xvmulsp vs33, vs26, vs5 + xvmulsp vs34, vs26, vs6 + xvmulsp vs35, vs26, vs7 + + xvmulsp vs48, vs27, vs4 + xvmulsp vs49, vs27, vs5 + xvmulsp vs50, vs27, vs6 + xvmulsp vs51, vs27, vs7 + + +.else + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + xvmaddasp vs48, vs27, vs4 + xvmaddasp vs49, vs27, vs5 + xvmaddasp vs50, vs27, vs6 + xvmaddasp vs51, vs27, vs7 +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,32+\OffsetB) + +.else + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP16(\Index,64) + +.endif +.endif + + +.endm + + +.macro SAVE8x4 + slwi T10, LDC , 1 + add T1, CO, LDC +#if !defined(TRMMKERNEL) + lxv vs36, 0(CO) + lxv vs37, 0(T1) +#endif + add T2, CO, T10 + add T3, T1, T10 +#if !defined(TRMMKERNEL) + lxv vs38, 0(T2) + lxv vs39, 0(T3) +#endif + add T4, T2, T10 + add T5, T3, T10 +#if !defined(TRMMKERNEL) + lxv vs40, 0(T4) + lxv vs41, 0(T5) +#endif + add T6, T4, T10 + add T7, T5, T10 +#if !defined(TRMMKERNEL) + lxv vs42, 0(T6) + lxv vs43, 0(T7) +#endif + xxmrglw vs0, vs35,vs32 + xxmrglw vs1, vs34,vs33 + xxmrglw vs4, vs32,vs35 + xxmrglw vs5, vs33,vs34 + + + xxmrghw vs2, vs35,vs32 + xxmrghw vs3, vs34,vs33 + xxmrghw vs6, vs32,vs35 + xxmrghw vs7, vs33,vs34 + + xxmrgld vs24, vs1, vs0 + xxmrghd vs25,vs5,vs4 + + xxmrgld vs26, vs2, vs3 + xxmrghd vs27,vs6,vs7 + + + xxmrglw vs0, vs51,vs48 + xxmrglw vs1, vs50,vs49 + xxmrglw vs4, vs48,vs51 + xxmrglw vs5, vs49,vs50 + + xxmrghw vs2, vs51,vs48 + xxmrghw vs3, vs50,vs49 + xxmrghw vs6, vs48,vs51 + xxmrghw vs7, vs49,vs50 + + xxmrgld vs28, vs1, vs0 + xxmrghd vs29,vs5,vs4 + + xxmrgld vs30, vs2, vs3 + xxmrghd vs31,vs6,vs7 +#if defined(TRMMKERNEL) + + xvmulsp vs36, vs24, alpha_r + xvmulsp vs37, vs25, alpha_r + xvmulsp vs38, vs26, alpha_r + xvmulsp vs39, vs27, alpha_r + xvmulsp vs40, vs28, alpha_r + xvmulsp vs41, vs29, alpha_r + xvmulsp vs42, vs30, alpha_r + xvmulsp vs43, vs31, alpha_r +#else + xvmaddasp vs36, vs24, alpha_r + xvmaddasp vs37, vs25, alpha_r + xvmaddasp vs38, vs26, alpha_r + xvmaddasp vs39, vs27, alpha_r + xvmaddasp vs40, vs28, alpha_r + xvmaddasp vs41, vs29, alpha_r + xvmaddasp vs42, vs30, alpha_r + xvmaddasp vs43, vs31, alpha_r +#endif + + stxv vs36, 0(CO) + stxv vs37, 0(T1) + stxv vs38, 0(T2) + stxv vs39, 0(T3) + stxv vs40, 0(T4) + stxv vs41, 0(T5) + stxv vs42, 0(T6) + stxv vs43, 0(T7) + + + addi CO,CO,16 +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=2 +**********************************************************************************************/ + + +.macro KERNEL8x2_2 
OffsetA,OffsetB, Index,IsLast + KERNEL8x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + + +.macro Zero8x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + +.endm + +.macro KERNEL8x2 + KERNEL8x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL8x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP8(\Index,16+\OffsetB)(\BREG) + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs26, vs9 + xvmulsp vs3, vs27, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs26, vs9 + xvmaddasp vs3, vs27, vs9 + + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP8(\Index,32) + +.endm + +.macro KERNEL8x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs29, DISP16(\Index,48+\OffsetB)(\BREG) + xxspltw vs8, vs4, 2 + xxspltw vs9, vs4, 3 + xxspltw vs10, vs4, 0 + xxspltw vs11, vs4, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs26, vs9 + xvmulsp vs3, vs27, vs9 + + xvmulsp vs0, vs28, vs10 + xvmulsp vs1, vs29, vs10 + xvmulsp vs2, vs28, vs11 + xvmulsp vs3, vs29, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs26, vs9 + xvmaddasp vs3, vs27, vs9 + + xvmaddasp vs0, vs28, vs10 + xvmaddasp vs1, vs29, vs10 + xvmaddasp vs2, vs28, vs11 + xvmaddasp vs3, vs29, vs11 + .endif + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE8x2 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + add T4, T2, T10 + add T5, T3, T10 + add T6, T4, T10 + add T7, T5, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v1,4(CO) + + lxssp v2,0(T1) + lxssp v3,4(T1) + + lxssp v4,0(T2) + lxssp v5,4(T2) + + lxssp v6,0(T3) + lxssp v7,4(T3) + + lxssp v8,0(T4) + lxssp v9,4(T4) + + lxssp v10,0(T5) + lxssp v11,4(T5) + + lxssp v12,0(T6) + lxssp v13,4(T6) + + lxssp v14,0(T7) + lxssp v15,4(T7) +#endif + xscvspdp vs5, vs2 + xxspltw vs6, vs2, 1 + xxspltw vs7, vs2, 2 + xxspltw vs8, vs2, 3 + xscvspdp vs6,vs6 + xscvspdp vs7,vs7 + xscvspdp vs8,vs8 + + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + + xscvspdp vs9, vs3 + xxspltw vs10, vs3, 1 + xxspltw vs11, vs3, 2 + xxspltw vs12, vs3, 3 + xscvspdp vs10,vs10 + xscvspdp vs11,vs11 + xscvspdp vs12,vs12 + + xscvspdp vs28, vs1 + xxspltw vs29, vs1, 1 + xxspltw vs30, vs1, 2 + xxspltw vs31, vs1, 3 + xscvspdp vs29,vs29 + xscvspdp vs30,vs30 + xscvspdp vs31,vs31 + + + + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs8, vs4 + xsmuldp vs33,vs27, vs4 + + xsmuldp vs34,vs7, vs4 + xsmuldp vs35,vs26, vs4 + + xsmuldp vs36,vs6, vs4 + xsmuldp vs37,vs25, vs4 + + xsmuldp vs38,vs5, vs4 + xsmuldp vs39,vs24, vs4 + + xsmuldp vs40,vs12, vs4 + xsmuldp vs41,vs31, vs4 + + xsmuldp vs42,vs11, vs4 + xsmuldp vs43,vs30, vs4 + + xsmuldp vs44,vs10, vs4 + xsmuldp vs45,vs29, vs4 + + xsmuldp vs46,vs9, vs4 + xsmuldp vs47,vs28, vs4 +#else + xsmaddadp vs32,vs8, vs4 + xsmaddadp vs33,vs27, vs4 + + xsmaddadp vs34,vs7, vs4 + 
xsmaddadp vs35,vs26, vs4 + + xsmaddadp vs36,vs6, vs4 + xsmaddadp vs37,vs25, vs4 + + xsmaddadp vs38,vs5, vs4 + xsmaddadp vs39,vs24, vs4 + + xsmaddadp vs40,vs12, vs4 + xsmaddadp vs41,vs31, vs4 + + xsmaddadp vs42,vs11, vs4 + xsmaddadp vs43,vs30, vs4 + + xsmaddadp vs44,vs10, vs4 + xsmaddadp vs45,vs29, vs4 + + xsmaddadp vs46,vs9, vs4 + xsmaddadp vs47,vs28, vs4 +#endif + + stxssp v0,0(CO) + stxssp v1,4(CO) + + stxssp v2,0(T1) + stxssp v3,4(T1) + + stxssp v4,0(T2) + stxssp v5,4(T2) + + stxssp v6,0(T3) + stxssp v7,4(T3) + + stxssp v8,0(T4) + stxssp v9,4(T4) + + stxssp v10,0(T5) + stxssp v11,4(T5) + + stxssp v12,0(T6) + stxssp v13,4(T6) + + stxssp v14,0(T7) + stxssp v15,4(T7) + + + addi CO,CO,8 +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=1 +**********************************************************************************************/ +.macro KERNEL8x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL8x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro Zero8x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +.endm + +.macro KERNEL8x1 + KERNEL8x1_1 AO,BO, 0 +.endm + +.macro KERNEL8x1_2 + KERNEL8x1_2_1 AO,BO, 0 +.endm + +.macro KERNEL8x1_1 AREG,BREG,First + lxvwsx vs8, 0, \AREG + lxv vs26, 0(\BREG) + lxv vs27, 16(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + .endif + addi \AREG, \AREG, 4 + addi \BREG, \BREG, 32 +.endm + +.macro KERNEL8x1_2_1 AREG,BREG,First + lxsd v4, 0(\AREG) + lxv vs26, 0(\BREG) + lxv vs27, 16(\BREG) + lxv vs28, 32(\BREG) + lxv vs29, 48(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs1, vs29, vs9 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs1, vs29, vs9 + .endif + addi \AREG, \AREG, 8 + addi \BREG, \BREG, 64 +.endm + +.macro KERNEL8x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + xxspltw vs8, vs4, 3 + xxspltw vs9, vs4, 2 + xxspltw vs10, vs4, 1 + xxspltw vs11, vs4, 0 + lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG) + lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG) + lxv vs28, DISP32(\Index,32+\OffsetB)(\BREG) + lxv vs29, DISP32(\Index,48+\OffsetB)(\BREG) + lxv vs30, DISP32(\Index,64+ 0+\OffsetB)(\BREG) + lxv vs31, DISP32(\Index,64+16+\OffsetB)(\BREG) + lxv vs32, DISP32(\Index,64+32+\OffsetB)(\BREG) + lxv vs33, DISP32(\Index,64+48+\OffsetB)(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs1, vs29, vs9 + xvmulsp vs0, vs30, vs10 + xvmulsp vs1, vs31, vs10 + xvmulsp vs0, vs32, vs11 + xvmulsp vs1, vs33, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs1, vs29, vs9 + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + xvmaddasp vs0, vs32, vs11 + xvmaddasp vs1, vs33, vs11 + .endif +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP32(\Index,128) +.endif +.endm + +.macro SAVE8x1 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + add T4, T2, T10 + add T5, T3, T10 + add T6, T4, T10 + add T7, T5, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v2,0(T1) + lxssp v4,0(T2) + lxssp v6,0(T3) + lxssp v8,0(T4) + lxssp v10,0(T5) + lxssp 
v12,0(T6) + lxssp v14,0(T7) +#endif + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + xscvspdp vs28, vs1 + xxspltw vs29, vs1, 1 + xxspltw vs30, vs1, 2 + xxspltw vs31, vs1, 3 + xscvspdp vs29,vs29 + xscvspdp vs30,vs30 + xscvspdp vs31,vs31 +#if defined(TRMMKERNEL) + xsmuldp vs32,vs27, vs4 + xsmuldp vs34,vs26, vs4 + xsmuldp vs36,vs25, vs4 + xsmuldp vs38,vs24, vs4 + xsmuldp vs40,vs31, vs4 + xsmuldp vs42,vs30, vs4 + xsmuldp vs44,vs29, vs4 + xsmuldp vs46,vs28, vs4 +#else + xsmaddadp vs32,vs27, vs4 + xsmaddadp vs34,vs26, vs4 + xsmaddadp vs36,vs25, vs4 + xsmaddadp vs38,vs24, vs4 + xsmaddadp vs40,vs31, vs4 + xsmaddadp vs42,vs30, vs4 + xsmaddadp vs44,vs29, vs4 + xsmaddadp vs46,vs28, vs4 +#endif + stxssp v0,0(CO) + stxssp v2,0(T1) + stxssp v4,0(T2) + stxssp v6,0(T3) + stxssp v8,0(T4) + stxssp v10,0(T5) + stxssp v12,0(T6) + stxssp v14,0(T7) + addi CO,CO,4 +.endm + + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro LOAD4x16_1 + LOAD4x16 1 +.endm + +.macro LOAD4x16_0 + LOAD4x16 0 +.endm + +.macro KERNEL4x16_L1_L4 Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero4X16 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + +.macro LOAD4x16 Zero + + lxv vs24, 0(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + lxv vs2, 32(AO) + lxv vs3, 48(AO) + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + xxpermdi vs27, vs26, vs26,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + +.endif +.endm + +.macro END4x16_NORMAL + END4x16 0, AO, BO, 64,16 +.endm + +.macro END4x16 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + 
xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + +.endif +.endm + +.macro KERNEL4x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + + + lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + + xxpermdi vs27, vs26, vs26,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) + + lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG) + lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG) + lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG) + lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + + +.if \Complete==0 + lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) + + lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG) + lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG) + lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG) + lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi 
vs25, vs24, vs24,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP64(\Index,256) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + +.endm + +.macro KERNEL4x16 First + + LOAD4x16 0 + END4x16 \First, AO, BO, 64,16 +.endm + +.macro KERNEL4x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + xvmulsp vs34, vs2,vs24 + xvmulsp vs35, vs3,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + xvmulsp vs38, vs2,vs25 + xvmulsp vs39, vs3,vs25 +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + xvmaddasp vs34, vs2,vs24 + xvmaddasp vs35, vs3,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + xvmaddasp vs38, vs2,vs25 + xvmaddasp vs39, vs3,vs25 +.endif + + xxpermdi vs11, vs10, vs10,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + xvmulsp vs42, vs2,vs26 + xvmulsp vs43, vs3,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + xvmulsp vs46, vs2,vs27 + xvmulsp vs47, vs3,vs27 + + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + xvmaddasp vs42, vs2,vs26 + xvmaddasp vs43, vs3,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + xvmaddasp vs46, vs2,vs27 + xvmaddasp vs47, vs3,vs27 + + +.endif +.if \Complete==0 + lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) + lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG) + lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG) + lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,64+\OffsetA) + +.else + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + xvmulsp vs34, vs6,vs8 + xvmulsp vs35, vs7,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + xvmulsp vs38, vs6,vs9 + xvmulsp vs39, vs7,vs9 +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + xvmaddasp vs34, vs6,vs8 + xvmaddasp vs35, vs7,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + xvmaddasp vs38, vs6,vs9 + xvmaddasp vs39, vs7,vs9 +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + xvmulsp vs42, vs6,vs10 + xvmulsp vs43, vs7,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + xvmulsp vs46, vs6,vs11 + xvmulsp vs47, vs7,vs11 + + + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + xvmaddasp vs42, vs6,vs10 + xvmaddasp vs43, 
vs7,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + xvmaddasp vs46, vs6,vs11 + xvmaddasp vs47, vs7,vs11 + + + +.endif + +.endm + + +.macro SAVE4x16 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxmrglw vs16, vs34, vs46 + xxmrglw vs18, vs38, vs42 + + xxlor vs9, vs8, vs8 + xxlor vs11, vs10, vs10 + + xxmrghw vs4, vs38, vs42 + xxmrghw vs5, vs34, vs46 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxmrglw vs24, vs35, vs47 + xxmrglw vs26, vs39, vs43 + + xxlor vs17, vs16, vs16 + xxlor vs19, vs18, vs18 + + xxmrghw vs30, vs39, vs43 + xxmrghw vs31, vs35, vs47 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + +#ifndef TRMMKERNEL + lxv vs32, 0(CO) + lxv vs33, 16(CO) + lxv vs34, 32(CO) + lxv vs35, 48(CO) +#endif + xxlor vs25, vs24, vs24 + xxlor vs27, vs26, vs26 + +#ifndef TRMMKERNEL + lxv vs36, 0(T1) + lxv vs37, 16(T1) + lxv vs38, 32(T1) + lxv vs39, 48(T1) +#endif +#ifndef TRMMKERNEL + lxv vs40, 0(T2) + lxv vs41, 16(T2) + lxv vs42, 32(T2) + lxv vs43, 48(T2) +#endif +#ifndef TRMMKERNEL + lxv vs44, 0(T3) + lxv vs45, 16(T3) + lxv vs46, 32(T3) + lxv vs47, 48(T3) +#endif + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + xxperm vs16, vs4, save_permute_1 + xxperm vs18, vs5, save_permute_1 + + xxperm vs17, vs4, save_permute_2 + xxperm vs19, vs5, save_permute_2 + + xxperm vs24, vs30, save_permute_1 + xxperm vs26, vs31, save_permute_1 + + xxperm vs25, vs30, save_permute_2 + xxperm vs27, vs31, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs32, vs8, alpha_r + xvmulsp vs33, vs12, alpha_r + xvmulsp vs34, vs16, alpha_r + xvmulsp vs35, vs24, alpha_r + xvmulsp vs36, vs9, alpha_r + xvmulsp vs37, vs13, alpha_r + xvmulsp vs38, vs17, alpha_r + xvmulsp vs39, vs25, alpha_r +#else + xvmaddasp vs32, vs8, alpha_r + xvmaddasp vs33, vs12, alpha_r + xvmaddasp vs34, vs16, alpha_r + xvmaddasp vs35, vs24, alpha_r + xvmaddasp vs36, vs9, alpha_r + xvmaddasp vs37, vs13, alpha_r + xvmaddasp vs38, vs17, alpha_r + xvmaddasp vs39, vs25, alpha_r +#endif + + + +#ifdef TRMMKERNEL + xvmulsp vs40, vs10, alpha_r + xvmulsp vs41, vs14, alpha_r + xvmulsp vs42, vs18, alpha_r + xvmulsp vs43, vs26, alpha_r + xvmulsp vs44, vs11, alpha_r + xvmulsp vs45, vs15, alpha_r + xvmulsp vs46, vs19, alpha_r + xvmulsp vs47, vs27, alpha_r +#else + + xvmaddasp vs40, vs10, alpha_r + xvmaddasp vs41, vs14, alpha_r + xvmaddasp vs42, vs18, alpha_r + xvmaddasp vs43, vs26, alpha_r + xvmaddasp vs44, vs11, alpha_r + xvmaddasp vs45, vs15, alpha_r + xvmaddasp vs46, vs19, alpha_r + xvmaddasp vs47, vs27, alpha_r + +#endif + + stxv vs32, 0(CO) + stxv vs33, 16(CO) + stxv vs34, 32(CO) + stxv vs35, 48(CO) + + stxv vs36, 0(T1) + stxv vs37, 16(T1) + stxv vs38, 32(T1) + stxv vs39, 48(T1) + + stxv vs40, 0(T2) + stxv vs41, 16(T2) + stxv vs42, 32(T2) + stxv vs43, 48(T2) + stxv vs44, 0(T3) + stxv vs45, 16(T3) + stxv vs46, 32(T3) + stxv vs47, 48(T3) + + addi CO,CO,64 + + +.endm + + + +/********************************************************************************************** +* Macros for N=4 and M=8 
+**********************************************************************************************/ + +.macro LOAD4x8_1 + LOAD4x8 1 +.endm + +.macro LOAD4x8_0 + LOAD4x8 0 +.endm + +.macro KERNEL4x8_L1_L4 Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro END4x8_NORMAL + END4x8 0, AO, BO, 32,16 +.endm + +.macro Zero4X8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + +.endm + +.macro LOAD4x8 Zero + + lxv vs24, 0(BO) + lxv vs0, 0(AO) + lxv vs1, 16(AO) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + xxpermdi vs27, vs26, vs26,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + +.endif +.endm + + +.macro END4x8 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + +.endif +.endm + +.macro KERNEL4x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + + + lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + + xxpermdi vs27, vs26, vs26,2 + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + + + lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG) + + lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG) + lxv vs5, 
DISP32(\Index,64+16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 + + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + + xxpermdi vs11, vs10, vs10,2 + + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + + +.if \Complete==0 + lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG) + + lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG) + lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 + +.endif +.if \IsLast==1 +.if \Complete==1 + + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA) +.else + + addi \BREG, \BREG, DISP16(\Index,64) + addi \AREG, \AREG, DISP32(\Index,128) +.endif +.endif + + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif + + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + + + +.endm + +.macro KERNEL4x8 First + + LOAD4x8 0 + END4x8 \First, AO, BO, 32,16 +.endm + +.macro KERNEL4x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) + + xxperm vs10, vs8, permute_mask + xxpermdi vs9, vs8, vs8,2 +.if \First==1 + xvmulsp vs32, vs0,vs24 + xvmulsp vs33, vs1,vs24 + + xvmulsp vs36, vs0,vs25 + xvmulsp vs37, vs1,vs25 + +.else + xvmaddasp vs32, vs0,vs24 + xvmaddasp vs33, vs1,vs24 + + xvmaddasp vs36, vs0,vs25 + xvmaddasp vs37, vs1,vs25 + +.endif + + xxpermdi vs11, vs10, vs10,2 + +.if \First==1 + xvmulsp vs40, vs0,vs26 + xvmulsp vs41, vs1,vs26 + + xvmulsp vs44, vs0,vs27 + xvmulsp vs45, vs1,vs27 + + +.else + xvmaddasp vs40, vs0,vs26 + xvmaddasp vs41, vs1,vs26 + + xvmaddasp vs44, vs0,vs27 + xvmaddasp vs45, vs1,vs27 + + +.endif +.if \Complete==0 + lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG) + + lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG) + + xxperm vs26, vs24, permute_mask + xxpermdi vs25, vs24, vs24,2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + addi \AREG, \AREG, DISP16(\Index,32+\OffsetA) + +.else + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP16(\Index,64) +.endif +.endif + +.if \First==1 + xvmulsp vs32, vs4,vs8 + xvmulsp vs33, vs5,vs8 + + xvmulsp vs36, vs4,vs9 + xvmulsp vs37, vs5,vs9 + +.else + xvmaddasp vs32, vs4,vs8 + xvmaddasp vs33, vs5,vs8 + + xvmaddasp vs36, vs4,vs9 + xvmaddasp vs37, vs5,vs9 + +.endif + +.if \Complete==0 + xxpermdi vs27, vs26, vs26,2 + +.endif +.if \First==1 + xvmulsp vs40, vs4,vs10 + xvmulsp vs41, vs5,vs10 + + xvmulsp vs44, vs4,vs11 + xvmulsp vs45, vs5,vs11 + +.else + xvmaddasp vs40, vs4,vs10 + xvmaddasp vs41, vs5,vs10 + + xvmaddasp vs44, vs4,vs11 + xvmaddasp vs45, vs5,vs11 + +.endif + +.endm + + +.macro SAVE4x8 + + slwi T10, LDC , 1 + add T1, CO, LDC + + add T2, CO, T10 + add T3, T1, T10 + + + +#ifndef TRMMKERNEL + lxv vs34, 0(CO) + lxv vs35, 16(CO) + lxv vs38, 0(T1) + lxv vs39, 16(T1) + lxv vs42, 0(T2) + lxv vs43, 16(T2) + lxv vs46, 0(T3) + lxv vs47, 16(T3) + + +#endif + + xxmrglw vs8, vs32, vs44 + xxmrglw vs10, vs36, vs40 + + xxmrghw vs1, vs32, vs44 + xxmrghw vs0, vs36, vs40 + + xxmrglw vs12, vs33, vs45 + xxmrglw vs14, vs37, vs41 + + xxmrghw vs2, vs37, vs41 + xxmrghw vs3, vs33, vs45 + + xxlor vs9, 
vs8, vs8 + xxlor vs11, vs10, vs10 + + xxlor vs13, vs12, vs12 + xxlor vs15, vs14, vs14 + + xxperm vs8, vs0, save_permute_1 + xxperm vs10, vs1, save_permute_1 + xxperm vs9, vs0, save_permute_2 + xxperm vs11, vs1, save_permute_2 + + xxperm vs12, vs2, save_permute_1 + xxperm vs14, vs3, save_permute_1 + + xxperm vs13, vs2, save_permute_2 + xxperm vs15, vs3, save_permute_2 + + + /* multiply add normal way */ + +#ifdef TRMMKERNEL + xvmulsp vs34, vs8, alpha_r + xvmulsp vs35, vs12, alpha_r + xvmulsp vs38, vs9, alpha_r + xvmulsp vs39, vs13, alpha_r + xvmulsp vs42, vs10, alpha_r + xvmulsp vs43, vs14, alpha_r + xvmulsp vs46, vs11, alpha_r + xvmulsp vs47, vs15, alpha_r +#else + xvmaddasp vs34, vs8, alpha_r + xvmaddasp vs35, vs12, alpha_r + xvmaddasp vs38, vs9, alpha_r + xvmaddasp vs39, vs13, alpha_r + xvmaddasp vs42, vs10, alpha_r + xvmaddasp vs43, vs14, alpha_r + xvmaddasp vs46, vs11, alpha_r + xvmaddasp vs47, vs15, alpha_r +#endif + + + stxv vs34, 0(CO) + stxv vs35, 16(CO) + stxv vs38, 0(T1) + stxv vs39, 16(T1) + stxv vs42, 0(T2) + stxv vs43, 16(T2) + stxv vs46, 0(T3) + stxv vs47, 16(T3) + + + addi CO,CO,32 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + LOAD4x4 1 +.endm + +.macro LOAD4x4_0 + LOAD4x4 0 +.endm + +.macro KERNEL4x4_L1_L4 Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm +.macro KERNEL4x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro KERNEL4x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0 +.endm + +.macro KERNEL4x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast + KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1 +.endm + +.macro Zero4X4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endm + +.macro LOAD4x4 Zero + + lxv vs0, 0(AO) + lxv vs24, 0(BO) + + + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + +.if \Zero==1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endif +.endm + +.macro END4x4_NORMAL + END4x4 0, AO, BO, 16,16 +.endm + +.macro END4x4 First, AREG, BREG, OffsetA, OffsetB + +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + +.endif +.endm + +.macro KERNEL4x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, 
vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + + lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 + + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + + + lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs26, DISP16(\Index, 32+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 + + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + + +.if \Complete==0 + + lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG) + lxv vs24, DISP16(\Index, 48+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA) + addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB) + +.else + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP16(\Index,64) + +.endif +.endif + + +.endm + +.macro KERNEL4x4 First + LOAD4x4 0 + END4x4 \First, AO, BO, 16,16 +.endm + +.macro KERNEL4x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete + + lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + + xxperm vs6, vs4, permute_mask + xxpermdi vs5, vs4, vs4,2 + xxpermdi vs7, vs6, vs6,2 +.if \First==1 + xvmulsp vs32, vs24, vs0 + xvmulsp vs33, vs24, vs1 + xvmulsp vs34, vs24, vs2 + xvmulsp vs35, vs24, vs3 + +.else + xvmaddasp vs32, vs24, vs0 + xvmaddasp vs33, vs24, vs1 + xvmaddasp vs34, vs24, vs2 + xvmaddasp vs35, vs24, vs3 + +.endif + +.if \Complete==0 + + lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG) + lxv vs24, DISP8(\Index, 16+\OffsetB)(\BREG) + + xxperm vs2, vs0, permute_mask + xxpermdi vs1, vs0, vs0,2 + xxpermdi vs3, vs2, vs2,2 +.endif + +.if \First==1 + xvmulsp vs32, vs26, vs4 + xvmulsp vs33, vs26, vs5 + xvmulsp vs34, vs26, vs6 + xvmulsp vs35, vs26, vs7 + + +.else + xvmaddasp vs32, vs26, vs4 + xvmaddasp vs33, vs26, vs5 + xvmaddasp vs34, vs26, vs6 + xvmaddasp vs35, vs26, vs7 + +.endif + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,16+\OffsetA) + addi \BREG, \BREG, DISP8(\Index,16+\OffsetB) + +.else + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP8(\Index,32) + +.endif +.endif + + +.endm + + +.macro SAVE4x4 + slwi T10, LDC , 1 + add T1, CO, LDC +#if !defined(TRMMKERNEL) + lxv vs36, 0(CO) + lxv vs37, 0(T1) +#endif + add T2, CO, T10 + add T3, T1, T10 +#if !defined(TRMMKERNEL) + lxv vs38, 0(T2) + lxv vs39, 0(T3) +#endif + + xxmrglw vs0, vs35,vs32 + xxmrglw vs1, vs34,vs33 + xxmrglw vs4, vs32,vs35 + xxmrglw vs5, vs33,vs34 + + + xxmrghw vs2, vs35,vs32 + xxmrghw vs3, vs34,vs33 + xxmrghw vs6, vs32,vs35 + xxmrghw vs7, vs33,vs34 + + xxmrgld vs24, vs1, vs0 + xxmrghd vs25,vs5,vs4 + + xxmrgld vs26, vs2, vs3 + xxmrghd vs27,vs6,vs7 + + #if defined(TRMMKERNEL) + xvmulsp vs36, vs24, alpha_r + xvmulsp vs37, vs25, alpha_r + xvmulsp vs38, vs26, alpha_r + xvmulsp vs39, vs27, alpha_r +#else + xvmaddasp vs36, vs24, alpha_r + xvmaddasp vs37, vs25, alpha_r + xvmaddasp vs38, vs26, alpha_r + xvmaddasp vs39, vs27, alpha_r + #endif + stxv vs36, 0(CO) + stxv vs37, 0(T1) + stxv vs38, 0(T2) + stxv vs39, 0(T3) + + + + addi CO,CO,16 +.endm + + 
+/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + + +.macro KERNEL4x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL4x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + + +.macro Zero4x2 + xxlxor vs0, vs0, vs0 + xxlxor vs2, vs2, vs2 + +.endm + +.macro KERNEL4x2 + KERNEL4x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL4x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs2, vs26, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs2, vs26, vs9 + + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP4(\Index,16) + +.endm + +.macro KERNEL4x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs28, DISP8(\Index,16+\OffsetB)(\BREG) + xxspltw vs8, vs4, 2 + xxspltw vs9, vs4, 3 + xxspltw vs10, vs4, 0 + xxspltw vs11, vs4, 1 + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs2, vs26, vs9 + + xvmulsp vs0, vs28, vs10 + xvmulsp vs2, vs28, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs2, vs26, vs9 + + xvmaddasp vs0, vs28, vs10 + xvmaddasp vs2, vs28, vs11 + .endif + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE4x2 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v1,4(CO) + + lxssp v2,0(T1) + lxssp v3,4(T1) + + lxssp v4,0(T2) + lxssp v5,4(T2) + + lxssp v6,0(T3) + lxssp v7,4(T3) + + +#endif + xscvspdp vs5, vs2 + xxspltw vs6, vs2, 1 + xxspltw vs7, vs2, 2 + xxspltw vs8, vs2, 3 + xscvspdp vs6,vs6 + xscvspdp vs7,vs7 + xscvspdp vs8,vs8 + + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs8, vs4 + xsmuldp vs33,vs27, vs4 + + xsmuldp vs34,vs7, vs4 + xsmuldp vs35,vs26, vs4 + + xsmuldp vs36,vs6, vs4 + xsmuldp vs37,vs25, vs4 + + xsmuldp vs38,vs5, vs4 + xsmuldp vs39,vs24, vs4 + + +#else + xsmaddadp vs32,vs8, vs4 + xsmaddadp vs33,vs27, vs4 + + xsmaddadp vs34,vs7, vs4 + xsmaddadp vs35,vs26, vs4 + + xsmaddadp vs36,vs6, vs4 + xsmaddadp vs37,vs25, vs4 + + xsmaddadp vs38,vs5, vs4 + xsmaddadp vs39,vs24, vs4 + + +#endif + + stxssp v0,0(CO) + stxssp v1,4(CO) + + stxssp v2,0(T1) + stxssp v3,4(T1) + + stxssp v4,0(T2) + stxssp v5,4(T2) + + stxssp v6,0(T3) + stxssp v7,4(T3) + + + + + addi CO,CO,8 +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ +.macro KERNEL4x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL4x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro Zero4x1 + xxlxor vs0, vs0, vs0 +.endm + +.macro KERNEL4x1 + KERNEL4x1_1 AO,BO, 0 +.endm + +.macro KERNEL4x1_2 + KERNEL4x1_2_1 AO,BO, 0 +.endm + +.macro KERNEL4x1_1 AREG,BREG,First + lxvwsx vs8, 0, \AREG + lxv vs26, 0(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 +.else + xvmaddasp vs0, vs26, vs8 + .endif + addi \AREG, 
\AREG, 4 + addi \BREG, \BREG, 16 +.endm + +.macro KERNEL4x1_2_1 AREG,BREG,First + lxsd v4, 0(\AREG) + lxv vs26, 0(\BREG) + lxv vs28, 16(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs0, vs28, vs9 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs0, vs28, vs9 + .endif + addi \AREG, \AREG, 8 + addi \BREG, \BREG, 32 +.endm + +.macro KERNEL4x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast + lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG) + xxspltw vs8, vs4, 3 + xxspltw vs9, vs4, 2 + xxspltw vs10, vs4, 1 + xxspltw vs11, vs4, 0 + lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG) + lxv vs28, DISP16(\Index,16+\OffsetB)(\BREG) + lxv vs30, DISP16(\Index,32+\OffsetB)(\BREG) + lxv vs32, DISP16(\Index,48+\OffsetB)(\BREG) +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs0, vs28, vs9 + xvmulsp vs0, vs30, vs10 + xvmulsp vs0, vs32, vs11 +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs0, vs28, vs9 + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs0, vs32, vs11 + .endif +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP16(\Index,64) +.endif +.endm + +.macro SAVE4x1 + slwi T10, LDC , 1 + add T1, CO, LDC + add T2, CO, T10 + add T3, T1, T10 + /*convert alpha_r for multiply*/ + xscvspdp vs4,alpha_r +/* v0 corresponds to vs32, do not forget*/ +#if !defined(TRMMKERNEL) + lxssp v0,0(CO) + lxssp v2,0(T1) + lxssp v4,0(T2) + lxssp v6,0(T3) +#endif + xscvspdp vs24, vs0 + xxspltw vs25, vs0, 1 + xxspltw vs26, vs0, 2 + xxspltw vs27, vs0, 3 + xscvspdp vs25,vs25 + xscvspdp vs26,vs26 + xscvspdp vs27,vs27 + +#if defined(TRMMKERNEL) + xsmuldp vs32,vs27, vs4 + xsmuldp vs34,vs26, vs4 + xsmuldp vs36,vs25, vs4 + xsmuldp vs38,vs24, vs4 +#else + xsmaddadp vs32,vs27, vs4 + xsmaddadp vs34,vs26, vs4 + xsmaddadp vs36,vs25, vs4 + xsmaddadp vs38,vs24, vs4 +#endif + stxssp v0,0(CO) + stxssp v2,0(T1) + stxssp v4,0(T2) + stxssp v6,0(T3) + addi CO,CO,4 +.endm + +/****************************N=2 section*****************/ + +.macro KERNEL2x16_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero2x16 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 +.endm + +.macro KERNEL2x16 + KERNEL2x16_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x16_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x16_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs28, vs8 + xvmulsp vs3, vs29, vs8 + + xvmulsp vs4, vs26, vs9 + xvmulsp vs5, vs27, vs9 + xvmulsp vs6, vs28, vs9 + xvmulsp vs7, vs29, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP16(\Index,64) + +.endm + + + + +.macro KERNEL2x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP64(\Index, 
0+\OffsetA)(\AREG) + lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) + + lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) + + lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) + lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) + lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) + lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) + + lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) + lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG) + lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) + lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs2, vs18, vs10 + xvmaddasp vs3, vs19, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + xvmaddasp vs6, vs18, vs11 + xvmaddasp vs7, vs19, vs11 + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs31, vs12 + xvmaddasp vs2, vs32, vs12 + xvmaddasp vs3, vs33, vs12 + + xvmaddasp vs4, vs30, vs13 + xvmaddasp vs5, vs31, vs13 + xvmaddasp vs6, vs32, vs13 + xvmaddasp vs7, vs33, vs13 + + xvmaddasp vs0, vs34, vs14 + xvmaddasp vs1, vs35, vs14 + xvmaddasp vs2, vs36, vs14 + xvmaddasp vs3, vs37, vs14 + + xvmaddasp vs4, vs34, vs15 + xvmaddasp vs5, vs35, vs15 + xvmaddasp vs6, vs36, vs15 + xvmaddasp vs7, vs37, vs15 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP64(\Index,256) +.endif + +.endm + +.macro KERNEL2x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) + lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + xvmaddasp vs6, vs28, vs9 + xvmaddasp vs7, vs29, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs2, vs18, vs10 + xvmaddasp vs3, vs19, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + xvmaddasp vs6, vs18, vs11 + xvmaddasp vs7, vs19, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + + +.macro SAVE2x16 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) + lxv vs18, 32(CO) + lxv vs19, 48(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + lxv vs27, 16(T1) + lxv vs28, 32(T1) + lxv vs29, 48(T1) +#endif + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs18, vs2, alpha_r + xvmulsp vs19, vs3, alpha_r + xvmulsp 
vs26, vs4, alpha_r + xvmulsp vs27, vs5, alpha_r + xvmulsp vs28, vs6, alpha_r + xvmulsp vs29, vs7, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs18, vs2, alpha_r + xvmaddasp vs19, vs3, alpha_r + xvmaddasp vs26, vs4, alpha_r + xvmaddasp vs27, vs5, alpha_r + xvmaddasp vs28, vs6, alpha_r + xvmaddasp vs29, vs7, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + stxv vs18, 32(CO) + stxv vs19, 48(CO) + + stxv vs26, 0(T1) + stxv vs27, 16(T1) + stxv vs28, 32(T1) + stxv vs29, 48(T1) + + addi CO,CO,64 + +.endm + +/* M=8 N=2 */ + +.macro KERNEL2x8_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero2x8 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +.endm + +.macro KERNEL2x8 + KERNEL2x8_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x8_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x8_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + + xvmulsp vs4, vs26, vs9 + xvmulsp vs5, vs27, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP8(\Index,32) + +.endm + + + + +.macro KERNEL2x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + + lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) + + lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + + lxv vs34, DISP32(\Index, 96+ 0+\OffsetA)(\AREG) + lxv vs35, DISP32(\Index, 96+ 16+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs31, vs12 + xvmaddasp vs4, vs30, vs13 + xvmaddasp vs5, vs31, vs13 + + xvmaddasp vs0, vs34, vs14 + xvmaddasp vs1, vs35, vs14 + xvmaddasp vs4, vs34, vs15 + xvmaddasp vs5, vs35, vs15 + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + +.macro KERNEL2x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,32+\OffsetA)(\AREG) + lxv vs17, DISP16(\Index,48+\OffsetA)(\AREG) + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + xvmaddasp vs4, vs26, vs9 + xvmaddasp vs5, vs27, vs9 + + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs17, vs10 + + xvmaddasp vs4, vs16, vs11 + xvmaddasp vs5, vs17, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, 
DISP4(\Index,16) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE2x8 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + lxv vs27, 16(T1) + +#endif + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs26, vs4, alpha_r + xvmulsp vs27, vs5, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs26, vs4, alpha_r + xvmaddasp vs27, vs5, alpha_r +#endif + + stxv vs16, 0(CO) + stxv vs17, 16(CO) + + + stxv vs26, 0(T1) + stxv vs27, 16(T1) + + addi CO,CO,32 + +.endm + + +/*M=4*/ + + +.macro KERNEL2x4_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + /* we will aggregate on save vs0 +vs4 vs11+vs5 */ +.macro Zero2x4 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +.endm + +.macro KERNEL2x4 + KERNEL2x4_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x4_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x4_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs26, vs9 + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP4(\Index,16) + +.endm + + + + +.macro KERNEL2x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,16+\OffsetA)(\AREG) + + lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs34, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + xxspltw vs12, vs39, 3 + xxspltw vs13, vs39, 2 + xxspltw vs14, vs39, 1 + xxspltw vs15, vs39, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs4, vs16, vs10 + xvmaddasp vs5, vs16, vs11 + + + xvmaddasp vs0, vs30, vs12 + xvmaddasp vs1, vs30, vs13 + xvmaddasp vs4, vs34, vs14 + xvmaddasp vs5, vs34, vs15 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL2x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 3 + xxspltw vs9, vs36, 2 + xxspltw vs10, vs36, 1 + xxspltw vs11, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP8(\Index, 16+\OffsetA)(\AREG) + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs4, vs16, vs10 + xvmaddasp vs5, vs16, vs11 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE2x4 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs26, 0(T1) + +#endif + /*aggregate vectors*/ + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs26, vs1, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs26, vs1, alpha_r +#endif + + stxv vs16, 0(CO) + stxv vs26, 0(T1) + + addi CO,CO,16 + +.endm + + +/* M=2 N=2 we will have inner pemrute action before permute was revrsing 3,2,1,0 not iw 2ill inner reverse 1,0,3,2 */ +.macro SWITCH_PERMUTE_INNER + xxpermdi 
permute_mask, permute_mask, permute_mask,2 +.endm + +.macro Zero2x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + SWITCH_PERMUTE_INNER +.endm + +.macro KERNEL2x2 + KERNEL2x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x2_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxperm vs9, vs36, permute_mask + lxsd v5, DISP2(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs37, vs36 + xvmulsp vs1, vs37, vs9 + +.else + xvmaddasp vs0, vs37, vs36 + xvmaddasp vs1, vs37, vs9 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP2(\Index,8) + +.endm + + + + +.macro KERNEL2x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) + + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs16, DISP8(\Index,16+\OffsetA)(\AREG) + + + xxperm vs9, vs8, permute_mask + xxperm vs11, vs10, permute_mask + + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + xvmaddasp vs0, vs16, vs10 + xvmaddasp vs1, vs16, vs11 + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + +.macro KERNEL2x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP4(\Index, 0+\OffsetB)(\BREG) + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + + xxperm vs9, vs8, permute_mask + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs26, vs9 + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP4(\Index,16) +.endif +.endm + + +.macro SAVE2x2 + +#ifndef TRMMKERNEL + lxsd v4 , 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxsd v5 , 0(T1) + +#endif + /*aggregate vectors*/ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + /* */ + /* lets correct the order to 00 10 and 10 ,11 from {00,11} {01,10} */ + xxperm vs1,vs1, permute_mask + + + xxmrghw vs2 ,vs1,vs0 + xxpermdi vs2,vs2,vs2,2 + xxmrghw vs3 ,vs0,vs1 +#if defined(TRMMKERNEL) + xvmulsp vs36, vs2, alpha_r + xvmulsp vs37, vs3, alpha_r +#else + xvmaddasp vs36, vs2, alpha_r + xvmaddasp vs37, vs3, alpha_r +#endif + /**** store last two words*/ + + + stxsd v4, 0(CO) + stxsd v5, 0(T1) + + addi CO,CO,8 + +.endm + +/*--------------------------- M=1 N=2 */ +.macro Zero2x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2,vs2,vs2 + xxlxor vs3,vs3,vs3 +.endm + +.macro KERNEL2x1 + KERNEL2x1_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL2x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL2x1_2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone then will add it to batched ones + */ +.macro KERNEL2x1_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP2(\Index, 0+\OffsetB)(\BREG) + lxssp v4, DISP2(\Index, 4+\OffsetB)(\BREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs2, vs37, vs35 + xvmulsp vs3, vs37, vs36 + +.else + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + .endif + + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP1(\Index,4) + +.endm + + + + +.macro KERNEL2x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG) + 
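+ /* 4 k-steps per call: vs8/vs10 hold the packed n=2 pairs of B; vs26 below holds the four matching A values, which xxmrglw/xxmrghw duplicate so each A element multiplies both B columns in the accumulating vector FMAs into vs0/vs1 */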
+ lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + xxmrglw vs5, vs26,vs26 + xxmrghw vs6, vs26,vs26 + + xvmaddasp vs0, vs8, vs5 + xvmaddasp vs1, vs10, vs6 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP8(\Index,32) + addi \AREG, \AREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL2x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxssp v3, DISP4(\Index, 0+\OffsetB)(\BREG) + lxssp v4, DISP4(\Index, 4+\OffsetB)(\BREG) + lxssp v7, DISP4(\Index, 8+\OffsetB)(\BREG) + lxssp v8, DISP4(\Index, 12+\OffsetB)(\BREG) + lxssp v5, DISP2(\Index, 0+\OffsetA)(\AREG) + lxssp v6, DISP2(\Index, 4+\OffsetA)(\AREG) + + + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + + xsmaddadp vs2, vs38, vs39 + xsmaddadp vs3, vs38, vs40 + + + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP2(\Index,8) +.endm + + +.macro SAVE2x1 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) +#endif + add T1, CO, LDC +#ifndef TRMMKERNEL + lxssp v5 , 0(T1) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors 2x2_4 */ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + xvaddsp vs0,vs0,vs1 +/*aggregate vectors 2x1_2 and 2x1_1 into 2x2_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs2,vs2,vs6 + xsadddp vs3,vs3,vs5 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs2, vs16 + xsmuldp vs37,vs3, vs16 + +#else + xsmaddadp vs36,vs2, vs16 + xsmaddadp vs37,vs3, vs16 +#endif + + stxssp v4, 0(CO) + stxssp v5, 0(T1) + + addi CO,CO,4 + +.endm + + + +/****************************N=1 section*****************/ + +.macro KERNEL1x16_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x16 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x16 + KERNEL1x16_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x16_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x16_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG) + lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + xvmulsp vs2, vs28, vs8 + xvmulsp vs3, vs29, vs8 + + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP16(\Index,64) + +.endm + + + + +.macro KERNEL1x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG) + + lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG) + lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG) + lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG) + lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG) + + lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG) + lxv vs35, DISP64(\Index,128+ 
64+ 16+\OffsetA)(\AREG) + lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG) + lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG) + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + + xvmaddasp vs0, vs16, vs9 + xvmaddasp vs1, vs17, vs9 + xvmaddasp vs2, vs18, vs9 + xvmaddasp vs3, vs19, vs9 + + + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + xvmaddasp vs2, vs32, vs10 + xvmaddasp vs3, vs33, vs10 + + + xvmaddasp vs0, vs34, vs11 + xvmaddasp vs1, vs35, vs11 + xvmaddasp vs2, vs36, vs11 + xvmaddasp vs3, vs37, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP64(\Index,256) +.endif + +.endm + +.macro KERNEL1x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG) + lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG) + lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG) + lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + xvmaddasp vs2, vs28, vs8 + xvmaddasp vs3, vs29, vs8 + + + xvmaddasp vs0, vs16, vs9 + xvmaddasp vs1, vs17, vs9 + xvmaddasp vs2, vs18, vs9 + xvmaddasp vs3, vs19, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + + +.macro SAVE1x16 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) + lxv vs18, 32(CO) + lxv vs19, 48(CO) +#endif + + +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r + xvmulsp vs18, vs2, alpha_r + xvmulsp vs19, vs3, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r + xvmaddasp vs18, vs2, alpha_r + xvmaddasp vs19, vs3, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + stxv vs18, 32(CO) + stxv vs19, 48(CO) + + addi CO,CO,64 + +.endm + +/* M=8 N=1 */ + +.macro KERNEL1x8_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x8 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x8 + KERNEL1x8_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x8_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x8_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 + xvmulsp vs1, vs27, vs8 + + +.else + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP8(\Index,32) + +.endm + + + + +.macro KERNEL1x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG) + + lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG) + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG) + lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG) + + lxv vs34, DISP32(\Index,64+ 32+ 
0+\OffsetA)(\AREG) + lxv vs35, DISP32(\Index,64+ 32+ 16+\OffsetA)(\AREG) + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + + xvmaddasp vs2, vs16, vs9 + xvmaddasp vs3, vs17, vs9 + + + xvmaddasp vs0, vs30, vs10 + xvmaddasp vs1, vs31, vs10 + + + xvmaddasp vs2, vs34, vs11 + xvmaddasp vs3, vs35, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP32(\Index,128) +.endif + +.endm + +.macro KERNEL1x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + lxv vs16, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs17, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs8 + + + xvmaddasp vs2, vs16, vs9 + xvmaddasp vs3, vs17, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + + +.macro SAVE1x8 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) + lxv vs17, 16(CO) +#endif + /* aggregate vs0 vs2 and vs1 vs3*/ + xvaddsp vs0,vs0,vs2 + xvaddsp vs1,vs1,vs3 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r + xvmulsp vs17, vs1, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r + xvmaddasp vs17, vs1, alpha_r +#endif + stxv vs16, 0(CO) + stxv vs17, 16(CO) + + addi CO,CO,32 + +.endm +/*M=4*/ + +.macro KERNEL1x4_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + + +.macro Zero1x4 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 +.endm + +.macro KERNEL1x4 + KERNEL1x4_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x4_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x4_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG) + xscvdpspn vs36,vs36 + xxspltw vs8, vs36, 0 + lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG) + + +.if \First==1 + xvmulsp vs0, vs26, vs8 +.else + xvmaddasp vs0, vs26, vs8 + + .endif + + addi \BREG, \BREG, DISP1(\Index,4) + addi \AREG, \AREG, DISP4(\Index,16) + +.endm + + + + +.macro KERNEL1x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG) + + lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG) + + + xxspltw vs8, vs38, 3 + xxspltw vs9, vs38, 2 + + lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG) + lxv vs31, DISP16(\Index,32+ 16+\OffsetA)(\AREG) + + + xxspltw vs10, vs38, 1 + xxspltw vs11, vs38, 0 + + + xvmaddasp vs0, vs26, vs8 + + xvmaddasp vs1, vs27, vs9 + + xvmaddasp vs2, vs30, vs10 + + + xvmaddasp vs3, vs31, vs11 + + + + +.if \IsLast==1 + addi \BREG, \BREG, DISP4(\Index,16) + addi \AREG, \AREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL1x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG) + xxspltw vs8, vs36, 1 + xxspltw vs9, vs36, 0 + lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG) + lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG) + + + xvmaddasp vs0, vs26, vs8 + xvmaddasp vs1, vs27, vs9 + + +.if \IsLast==1 + addi \BREG, \BREG, DISP2(\Index,8) + addi \AREG, \AREG, DISP8(\Index,32) +.endif + +.endm + + +.macro SAVE1x4 + +#ifndef TRMMKERNEL + lxv vs16, 0(CO) +#endif + /* aggregate */ + xvaddsp vs0,vs0,vs2 + xvaddsp vs1,vs1,vs3 + xvaddsp vs0,vs1,vs0 +#if defined(TRMMKERNEL) + xvmulsp vs16, vs0, alpha_r +#else + xvmaddasp vs16, vs0, alpha_r 
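+ /* GEMM path: C += alpha * accumulated A*B; the TRMM path above overwrites C with alpha * A*B instead */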
+#endif + stxv vs16, 0(CO) + + addi CO,CO,16 + +.endm + +/* M=2 N=1*/ +.macro Zero1x2 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2,vs2,vs2 + xxlxor vs3,vs3,vs3 +.endm + +.macro KERNEL1x2 + KERNEL1x2_1 AO,BO, 0, 0,0,0 +.endm +.macro KERNEL1x2_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x2_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone then will add it to batched ones + */ +.macro KERNEL1x2_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP2(\Index, 0+\OffsetB)(\AREG) + lxssp v4, DISP2(\Index, 4+\OffsetB)(\AREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) + + +.if \First==1 + xvmuldp vs2, vs37, vs35 + xvmuldp vs3, vs37, vs36 + +.else + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + .endif + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP1(\Index,4) + +.endm + + + + +.macro KERNEL1x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) + lxv vs10, DISP8(\Index, 16+\OffsetB)(\AREG) + + lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) + + xxmrglw vs5, vs26,vs26 + xxmrghw vs6, vs26,vs26 + + xvmaddasp vs0, vs8, vs5 + xvmaddasp vs1, vs10, vs6 + + +.if \IsLast==1 + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL1x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxssp v3, DISP4(\Index, 0+\OffsetB)(\AREG) + lxssp v4, DISP4(\Index, 4+\OffsetB)(\AREG) + lxssp v7, DISP4(\Index, 8+\OffsetB)(\AREG) + lxssp v8, DISP4(\Index, 12+\OffsetB)(\AREG) + lxssp v5, DISP2(\Index, 0+\OffsetA)(\BREG) + lxssp v6, DISP2(\Index, 4+\OffsetA)(\BREG) + + + xsmaddadp vs2, vs37, vs35 + xsmaddadp vs3, vs37, vs36 + + xsmaddadp vs2, vs38, vs39 + xsmaddadp vs3, vs38, vs40 + + + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP2(\Index,8) +.endm + + +.macro SAVE1x2 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) + lxssp v5 , 4(CO) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors 1x2_4 */ + xxpermdi vs4,vs0,vs0,2 + xxpermdi vs5,vs1,vs1,2 + xvaddsp vs0,vs0,vs4 + xvaddsp vs1,vs1,vs5 + xvaddsp vs0,vs0,vs1 +/*aggregate vectors 1x1_2 and 1x1_1 into 1x2_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs2,vs2,vs6 + xsadddp vs3,vs3,vs5 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs2, vs16 + xsmuldp vs37,vs3, vs16 + +#else + xsmaddadp vs36,vs2, vs16 + xsmaddadp vs37,vs3, vs16 +#endif + + stxssp v4, 0(CO) + stxssp v5, 4(CO) + + addi CO,CO,8 + +.endm +/*///////////////// N=1 M=1 //////////////////*/ +.macro Zero1x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2,vs2 + xxlxor vs3,vs3,vs3 + xxlxor vs4,vs4,vs4 +.endm + +.macro KERNEL1x1 + KERNEL1x1_1 AO,BO, 1, 0,0,0 +.endm + +.macro KERNEL1x1_16 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_16 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_8 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_8 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_4 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + +.macro KERNEL1x1_2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast +.endm + /* + we will calculate 1 alone ( FIRST==1 to zero vs4) + */ +.macro KERNEL1x1_1 AREG,BREG,First,OffsetA,OffsetB,Index + + + lxssp v3, DISP1(\Index, 0+\OffsetB)(\AREG) + lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG) + 
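+ /* one k-step: a single element of A and of B; \First==1 seeds accumulator vs4 with a plain multiply, otherwise a scalar FMA adds into vs4; both pointers then advance by one float (4 bytes) */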
+ +.if \First==1 + xvmuldp vs4, vs37, vs35 + +.else + xsmaddadp vs4, vs37, vs35 + .endif + + addi \AREG, \AREG, DISP1(\Index,4) + addi \BREG, \BREG, DISP1(\Index,4) + +.endm + + +.macro KERNEL1x1_I_16 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP16(\Index, 0+\OffsetB)(\AREG) + lxv vs9, DISP16(\Index, 16+\OffsetB)(\AREG) + lxv vs10, DISP16(\Index, 32+0+\OffsetB)(\AREG) + lxv vs11, DISP16(\Index, 32+ 16+\OffsetB)(\AREG) + lxv vs26, DISP16(\Index, 0+\OffsetA)(\BREG) + lxv vs16, DISP16(\Index, 16+\OffsetA)(\BREG) + lxv vs17, DISP16(\Index, 32+0+\OffsetA)(\BREG) + lxv vs18, DISP16(\Index, 32+16+\OffsetA)(\BREG) + xvmaddasp vs0, vs8, vs26 + xvmaddasp vs1, vs9, vs16 + xvmaddasp vs2, vs10, vs17 + xvmaddasp vs3, vs11, vs18 +.if \IsLast==1 + addi \AREG, \AREG, DISP16(\Index,64) + addi \BREG, \BREG, DISP16(\Index,64) +.endif + +.endm + +.macro KERNEL1x1_I_8 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG) + lxv vs9, DISP8(\Index, 16+\OffsetB)(\AREG) + lxv vs26, DISP8(\Index, 0+\OffsetA)(\BREG) + lxv vs16, DISP8(\Index, 16+\OffsetA)(\BREG) + xvmaddasp vs0, vs8, vs26 + xvmaddasp vs1, vs9, vs16 + +.if \IsLast==1 + addi \AREG, \AREG, DISP8(\Index,32) + addi \BREG, \BREG, DISP8(\Index,32) +.endif + +.endm + + +.macro KERNEL1x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxv vs8, DISP4(\Index, 0+\OffsetB)(\AREG) + lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG) + + xvmaddasp vs0, vs8, vs26 + + +.if \IsLast==1 + addi \AREG, \AREG, DISP4(\Index,16) + addi \BREG, \BREG, DISP4(\Index,16) +.endif + +.endm + +.macro KERNEL1x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast + + lxsd v4, DISP2(\Index, 0+\OffsetB)(\AREG) + lxsd v5, DISP2(\Index, 0+\OffsetA)(\BREG) + + xvmaddasp vs0, vs36, vs37 + + addi \AREG, \AREG, DISP2(\Index,8) + addi \BREG, \BREG, DISP2(\Index,8) +.endm + + +.macro SAVE1x1 + +#ifndef TRMMKERNEL + lxssp v4 , 0(CO) + +#endif + + /*convert alpha_r for multiply*/ + xscvspdp vs16,alpha_r + + /*aggregate vectors */ + xvaddsp vs0,vs0,vs1 + xvaddsp vs2,vs2,vs3 + xvaddsp vs0,vs0,vs2 + + xxpermdi vs7,vs0,vs0,2 + xvaddsp vs0,vs0,vs7 +/*aggregate vectors 1x1_2 and 1x1_1 into 1x1_4*/ + xscvspdp vs5, vs0 + xxspltw vs6, vs0, 1 + xscvspdp vs6,vs6 + xsadddp vs7,vs5,vs6 + xsadddp vs4,vs4,vs7 + + /**** store last two words*/ +#if defined(TRMMKERNEL) + xsmuldp vs36,vs4, vs16 + +#else + xsmaddadp vs36,vs4, vs16 +#endif + + stxssp v4, 0(CO) + + addi CO,CO,4 + +.endm + + + + +/****************************TRMM POINTER REFRESH MACROSES*************************/ + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 4 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 3 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 2 + .endif +.endm + +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + + +/* +// #if (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif + +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif + +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + + #endif + + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif .endm \ No newline at end of file diff --git a/kernel/power/sgemv_n.c b/kernel/power/sgemv_n.c index 5dfb18f5b..f5c1ba729 100644 --- a/kernel/power/sgemv_n.c +++ b/kernel/power/sgemv_n.c @@ -1,470 +1,470 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ -#if !defined(__VEC__) || !defined(__ALTIVEC__) -#include "../arm/gemv_n.c" - -#else - -#include "common.h" - -#define NBMAX 4096 - -static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; - FLOAT x0,x1,x2,x3,x4,x5,x6,x7; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - b0 = a0 + lda4 ; - b1 = a1 + lda4 ; - b2 = a2 + lda4 ; - b3 = a3 + lda4 ; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - x4 = xo[4] * *alpha; - x5 = xo[5] * *alpha; - x6 = xo[6] * *alpha; - x7 = xo[7] * *alpha; - __vector float* va0 = (__vector float*)a0; - __vector float* va1 = (__vector float*)a1; - __vector float* va2 = (__vector float*)a2; - __vector float* va3 = (__vector float*)a3; - __vector float* vb0 = (__vector float*)b0; - __vector float* vb1 = (__vector float*)b1; - __vector float* vb2 = (__vector float*)b2; - __vector float* vb3 = (__vector float*)b3; - - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float v_x2 = {x2,x2,x2,x2}; - __vector float v_x3 = {x3,x3,x3,x3}; - __vector float v_x4 = {x4,x4,x4,x4}; - __vector float v_x5 = {x5,x5,x5,x5}; - __vector float v_x6 = {x6,x6,x6,x6}; - __vector float v_x7 = {x7,x7,x7,x7}; - __vector float* v_y =(__vector float*)y; - - for ( i=0; i< n/4; i++) - { - register __vector float vy=v_y[i]; - vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; - vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ; - v_y[i] =vy; - } - -} - -static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT x0,x1,x2,x3; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float v_x2 = {x2,x2,x2,x2}; - __vector float v_x3 = {x3,x3,x3,x3}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - __vector float* va2 = (__vector float*)ap[2]; - __vector float* va3 = (__vector float*)ap[3]; - - for ( i=0; i< n/4; i++ ) - { - register __vector float vy=v_y[i]; - vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; - v_y[i] =vy; - } - -} - -static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0,x1; - x0 = x[0] * *alpha; - x1 = x[1] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - - for ( i=0; i< n/4; i++ ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; - } - -} - - -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0 ; - x0 = x[0] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap; - - for ( i=0; i< n/4; i++ ) - { - v_y[i] += v_x0 * va0[i] ; - } - -} - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - BLASLONG i; - - for ( i=0; i> 3 ; - n2 = n & 7 ; - } - else - { - n1 = n >> 2 ; - n2 = n & 3 ; - - } - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - - y_ptr = y; - - BLASLONG NB = 
NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*4); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); - ap[0] += lda8; - ap[1] += lda8; - ap[2] += lda8; - ap[3] += lda8; - a_ptr += lda8; - x_ptr += 8; - } - - - if ( n2 & 4 ) - { - sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 4; - } - - if ( n2 & 2 ) - { - sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) - { - sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - a_ptr += lda; - x_ptr += 1; - } - - - } - else - { - - for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; - - } - - if ( m3 == 0 ) return(0); - - if ( m3 == 3 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - if ( lda == 3 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - - temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; - temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; - temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; - - a_ptr += 12; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += 3; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - return(0); - } - - - if ( m3 == 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - return(0); - } - - if ( m3 == 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n 
& -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - return(0); - } - - - return(0); -} - -#endif - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_n.c" + +#else + +#include "common.h" + +#define NBMAX 4096 + +static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; + FLOAT x0,x1,x2,x3,x4,x5,x6,x7; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4 ; + b1 = a1 + lda4 ; + b2 = a2 + lda4 ; + b3 = a3 + lda4 ; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + x4 = xo[4] * *alpha; + x5 = xo[5] * *alpha; + x6 = xo[6] * *alpha; + x7 = xo[7] * *alpha; + __vector float* va0 = (__vector float*)a0; + __vector float* va1 = (__vector float*)a1; + __vector float* va2 = (__vector float*)a2; + __vector float* va3 = (__vector float*)a3; + __vector float* vb0 = (__vector float*)b0; + __vector float* vb1 = (__vector float*)b1; + __vector float* vb2 = (__vector float*)b2; + __vector float* vb3 = (__vector float*)b3; + + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float v_x2 = {x2,x2,x2,x2}; + __vector float v_x3 = {x3,x3,x3,x3}; + __vector float v_x4 = {x4,x4,x4,x4}; + __vector float v_x5 = {x5,x5,x5,x5}; + __vector float v_x6 = {x6,x6,x6,x6}; + __vector float v_x7 = {x7,x7,x7,x7}; + __vector float* v_y =(__vector float*)y; + + for ( i=0; i< n/4; i++) + { + register __vector float vy=v_y[i]; + vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; + vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ; + v_y[i] =vy; + } + +} + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT x0,x1,x2,x3; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float v_x2 = {x2,x2,x2,x2}; + __vector float v_x3 = {x3,x3,x3,x3}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + __vector float* va2 = (__vector float*)ap[2]; + __vector float* va3 = (__vector float*)ap[3]; + + for ( i=0; i< n/4; i++ ) + { + register __vector float vy=v_y[i]; + vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; + v_y[i] =vy; + } + +} + +static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0,x1; + x0 = x[0] * *alpha; + x1 = x[1] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + + for ( i=0; i< n/4; i++ ) + { + v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; + } + +} + + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0 ; + x0 = x[0] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap; + + for ( i=0; i< n/4; i++ ) + { + v_y[i] += v_x0 * va0[i] ; + } + +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + + for ( i=0; i> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + y_ptr = y; + + BLASLONG NB = 
NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*4); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + x_ptr += 8; + } + + + if ( n2 & 4 ) + { + sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n 
& -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + +#endif + diff --git a/kernel/power/sgemv_n_8.c b/kernel/power/sgemv_n_8.c index 64696236a..0edb79129 100644 --- a/kernel/power/sgemv_n_8.c +++ b/kernel/power/sgemv_n_8.c @@ -1,514 +1,514 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -/****Note*** -UnUsed kernel -This kernel works. 
But it was not competitive enough to be added in production -It could be used and tested in future or could provide barebone for switching to inline assembly -*/ - -#include "common.h" - -#define NBMAX 4096 - -static void sgemv_kernel_8x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; - FLOAT x0,x1,x2,x3,x4,x5,x6,x7; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - b0 = a0 + lda4 ; - b1 = a1 + lda4 ; - b2 = a2 + lda4 ; - b3 = a3 + lda4 ; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - x4 = xo[4] * *alpha; - x5 = xo[5] * *alpha; - x6 = xo[6] * *alpha; - x7 = xo[7] * *alpha; - __vector float* va0 = (__vector float*)a0; - __vector float* va1 = (__vector float*)a1; - __vector float* va2 = (__vector float*)a2; - __vector float* va3 = (__vector float*)a3; - __vector float* vb0 = (__vector float*)b0; - __vector float* vb1 = (__vector float*)b1; - __vector float* vb2 = (__vector float*)b2; - __vector float* vb3 = (__vector float*)b3; - - register __vector float v_x0 = {x0,x0,x0,x0}; - register __vector float v_x1 = {x1,x1,x1,x1}; - register __vector float v_x2 = {x2,x2,x2,x2}; - register __vector float v_x3 = {x3,x3,x3,x3}; - register __vector float v_x4 = {x4,x4,x4,x4}; - register __vector float v_x5 = {x5,x5,x5,x5}; - register __vector float v_x6 = {x6,x6,x6,x6}; - register __vector float v_x7 = {x7,x7,x7,x7}; - __vector float* v_y =(__vector float*)y; - - for ( i=0; i< n/4; i+=2) - { - register __vector float vy_1=v_y[i]; - register __vector float vy_2=v_y[i+1]; - register __vector float va0_1=va0[i] ; - register __vector float va0_2=va0[i+1] ; - register __vector float va1_1=va1[i] ; - register __vector float va1_2=va1[i+1] ; - register __vector float va2_1=va2[i] ; - register __vector float va2_2=va2[i+1] ; - register __vector float va3_1=va3[i] ; - register __vector float va3_2=va3[i+1] ; - register __vector float vb0_1=vb0[i] ; - register __vector float vb0_2=vb0[i+1] ; - register __vector float vb1_1=vb1[i] ; - register __vector float vb1_2=vb1[i+1] ; - register __vector float vb2_1=vb2[i] ; - register __vector float vb2_2=vb2[i+1] ; - register __vector float vb3_1=vb3[i] ; - register __vector float vb3_2=vb3[i+1] ; - vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; - vy_1 += v_x4 * vb0_1 + v_x5 * vb1_1 + v_x6 * vb2_1 + v_x7 * vb3_1 ; - vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; - vy_2 += v_x4 * vb0_2 + v_x5 * vb1_2 + v_x6 * vb2_2 + v_x7 * vb3_2 ; - v_y[i] =vy_1; - v_y[i+1] =vy_2; - } - -} - -static void sgemv_kernel_8x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT x0,x1,x2,x3; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float v_x2 = {x2,x2,x2,x2}; - __vector float v_x3 = {x3,x3,x3,x3}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - __vector float* va2 = (__vector float*)ap[2]; - __vector float* va3 = (__vector float*)ap[3]; - - for ( i=0; i< n/4; i+=2 ) - { - register __vector float vy_1=v_y[i]; - register __vector float vy_2=v_y[i+1]; - register __vector float va0_1=va0[i] ; - register __vector float va0_2=va0[i+1] ; - register __vector float va1_1=va1[i] ; - register __vector float va1_2=va1[i+1] ; - register __vector float va2_1=va2[i] ; 
- register __vector float va2_2=va2[i+1] ; - register __vector float va3_1=va3[i] ; - register __vector float va3_2=va3[i+1] ; - vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; - vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; - v_y[i] =vy_1; - v_y[i+1] =vy_2; - } - -} - -static void sgemv_kernel_8x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0,x1; - x0 = x[0] * *alpha; - x1 = x[1] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - - for ( i=0; i< n/4; i+=2 ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; - v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ; - } - -} - - -static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0 ; - x0 = x[0] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap; - - for ( i=0; i< n/4; i+=2 ) - { - v_y[i] += v_x0 * va0[i] ; - v_y[i+1] += v_x0 * va0[i+1] ; - } - -} - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - BLASLONG i; - - for ( i=0; i> 3 ; - n2 = n & 7 ; - } - else - { - n1 = n >> 2 ; - n2 = n & 3 ; - - } - - m3 = m & 7 ; - m1 = m - m3; - m2 = (m & (NBMAX-1)) - m3 ; - - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*4); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - sgemv_kernel_8x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); - ap[0] += lda8; - ap[1] += lda8; - ap[2] += lda8; - ap[3] += lda8; - a_ptr += lda8; - x_ptr += 8; - } - - - if ( n2 & 4 ) - { - sgemv_kernel_8x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 4; - } - - if ( n2 & 2 ) - { - sgemv_kernel_8x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) - { - sgemv_kernel_8x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - a_ptr += lda; - x_ptr += 1; - } - - - } - else - { - - for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_8x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_8x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; - - } - - - if ( m3 & 4 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - FLOAT temp3 = 0.0; - if ( lda == 4 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[6] * x_ptr[1]; - temp3 += a_ptr[3] * x_ptr[0] + a_ptr[7] * x_ptr[1]; - - temp0 += a_ptr[8] * x_ptr[2] + a_ptr[12] * x_ptr[3]; - temp1 += a_ptr[9] * 
x_ptr[2] + a_ptr[13] * x_ptr[3]; - temp2 += a_ptr[10] * x_ptr[2] + a_ptr[14] * x_ptr[3]; - temp3 += a_ptr[11] * x_ptr[2] + a_ptr[15] * x_ptr[3]; - - a_ptr += 16; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - temp3 += a_ptr[3] * x_ptr[0] ; - a_ptr +=4; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - temp3 += a_ptr[3] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - y_ptr += inc_y; - y_ptr[0] += alpha * temp3; - y_ptr += inc_y; - a += 4; - } - - - if ( m3 & 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - a += 2; - } - - if ( m3 & 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - - - } - - - return(0); -} - - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/****Note*** +UnUsed kernel +This kernel works. But it was not competitive enough to be added in production +It could be used and tested in future or could provide barebone for switching to inline assembly +*/ + +#include "common.h" + +#define NBMAX 4096 + +static void sgemv_kernel_8x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; + FLOAT x0,x1,x2,x3,x4,x5,x6,x7; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4 ; + b1 = a1 + lda4 ; + b2 = a2 + lda4 ; + b3 = a3 + lda4 ; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + x4 = xo[4] * *alpha; + x5 = xo[5] * *alpha; + x6 = xo[6] * *alpha; + x7 = xo[7] * *alpha; + __vector float* va0 = (__vector float*)a0; + __vector float* va1 = (__vector float*)a1; + __vector float* va2 = (__vector float*)a2; + __vector float* va3 = (__vector float*)a3; + __vector float* vb0 = (__vector float*)b0; + __vector float* vb1 = (__vector float*)b1; + __vector float* vb2 = (__vector float*)b2; + __vector float* vb3 = (__vector float*)b3; + + register __vector float v_x0 = {x0,x0,x0,x0}; + register __vector float v_x1 = {x1,x1,x1,x1}; + register __vector float v_x2 = {x2,x2,x2,x2}; + register __vector float v_x3 = {x3,x3,x3,x3}; + register __vector float v_x4 = {x4,x4,x4,x4}; + register __vector float v_x5 = {x5,x5,x5,x5}; + register __vector float v_x6 = {x6,x6,x6,x6}; + register __vector float v_x7 = {x7,x7,x7,x7}; + __vector float* v_y =(__vector float*)y; + + for ( i=0; i< n/4; i+=2) + { + register __vector float vy_1=v_y[i]; + register __vector float vy_2=v_y[i+1]; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + register __vector float vb0_1=vb0[i] ; + register __vector float vb0_2=vb0[i+1] ; + register __vector float vb1_1=vb1[i] ; + register __vector float vb1_2=vb1[i+1] ; + register __vector float vb2_1=vb2[i] ; + register __vector float vb2_2=vb2[i+1] ; + register __vector float vb3_1=vb3[i] ; + register __vector float vb3_2=vb3[i+1] ; + vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; + vy_1 += v_x4 * vb0_1 + v_x5 * vb1_1 + v_x6 * vb2_1 + v_x7 * vb3_1 ; + vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; + vy_2 += v_x4 * vb0_2 + v_x5 * vb1_2 + v_x6 * vb2_2 + v_x7 * vb3_2 ; + v_y[i] =vy_1; + v_y[i+1] =vy_2; + } + +} + +static void sgemv_kernel_8x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT x0,x1,x2,x3; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 
= {x1,x1,x1,x1}; + __vector float v_x2 = {x2,x2,x2,x2}; + __vector float v_x3 = {x3,x3,x3,x3}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + __vector float* va2 = (__vector float*)ap[2]; + __vector float* va3 = (__vector float*)ap[3]; + + for ( i=0; i< n/4; i+=2 ) + { + register __vector float vy_1=v_y[i]; + register __vector float vy_2=v_y[i+1]; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; + vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; + v_y[i] =vy_1; + v_y[i+1] =vy_2; + } + +} + +static void sgemv_kernel_8x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0,x1; + x0 = x[0] * *alpha; + x1 = x[1] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float v_x1 = {x1,x1,x1,x1}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap[0]; + __vector float* va1 = (__vector float*)ap[1]; + + for ( i=0; i< n/4; i+=2 ) + { + v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; + v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ; + } + +} + + +static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0 ; + x0 = x[0] * *alpha; + __vector float v_x0 = {x0,x0,x0,x0}; + __vector float* v_y =(__vector float*)y; + __vector float* va0 = (__vector float*)ap; + + for ( i=0; i< n/4; i+=2 ) + { + v_y[i] += v_x0 * va0[i] ; + v_y[i+1] += v_x0 * va0[i+1] ; + } + +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + + for ( i=0; i> 3 ; + n2 = n & 7 ; + } + else + { + n1 = n >> 2 ; + n2 = n & 3 ; + + } + + m3 = m & 7 ; + m1 = m - m3; + m2 = (m & (NBMAX-1)) - m3 ; + + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*4); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_8x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + a_ptr += lda8; + x_ptr += 8; + } + + + if ( n2 & 4 ) + { + sgemv_kernel_8x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + sgemv_kernel_8x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + sgemv_kernel_8x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_8x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_8x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += 
lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + + if ( m3 & 4 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + FLOAT temp3 = 0.0; + if ( lda == 4 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[6] * x_ptr[1]; + temp3 += a_ptr[3] * x_ptr[0] + a_ptr[7] * x_ptr[1]; + + temp0 += a_ptr[8] * x_ptr[2] + a_ptr[12] * x_ptr[3]; + temp1 += a_ptr[9] * x_ptr[2] + a_ptr[13] * x_ptr[3]; + temp2 += a_ptr[10] * x_ptr[2] + a_ptr[14] * x_ptr[3]; + temp3 += a_ptr[11] * x_ptr[2] + a_ptr[15] * x_ptr[3]; + + a_ptr += 16; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + temp3 += a_ptr[3] * x_ptr[0] ; + a_ptr +=4; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + temp3 += a_ptr[3] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + y_ptr += inc_y; + y_ptr[0] += alpha * temp3; + y_ptr += inc_y; + a += 4; + } + + + if ( m3 & 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + a += 2; + } + + if ( m3 & 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + + + } + + + return(0); +} + + diff --git a/kernel/power/sgemv_t.c b/kernel/power/sgemv_t.c index 62c517a9d..c3fc8e77a 100644 --- a/kernel/power/sgemv_t.c +++ b/kernel/power/sgemv_t.c @@ -1,484 +1,484 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. 
Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ -#if !defined(__VEC__) || !defined(__ALTIVEC__) -#include "../arm/gemv_t.c" - -#else - -#include "common.h" - -#define NBMAX 2048 - -#include - -static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; - __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - register __vector float temp4 = {0,0,0,0}; - register __vector float temp5 = {0,0,0,0}; - register __vector float temp6 = {0,0,0,0}; - register __vector float temp7 = {0,0,0,0}; - - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - a4 = a3 + lda; - a5 = a4 + lda; - a6 = a5 + lda; - a7 = a6 + lda; - va0 = (__vector float*) a0; - va1 = (__vector float*) a1; - va2 = (__vector float*) a2; - va3 = (__vector float*) a3; - va4 = (__vector float*) a4; - va5 = (__vector float*) a5; - va6 = (__vector float*) a6; - va7 = (__vector float*) a7; - v_x = (__vector float*) x; - - - for (i = 0; i < n/4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - temp2 += v_x[i] * va2[i]; - temp3 += v_x[i] * va3[i]; - temp4 += v_x[i] * va4[i]; - temp5 += v_x[i] * va5[i]; - temp6 += v_x[i] * va6[i]; - temp7 += v_x[i] * va7[i]; - } - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - - y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); - y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); - y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); - y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); - -} - - -static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i = 0; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* va2 = (__vector float*) a2; - __vector float* va3 = (__vector float*) a3; - 
__vector float* v_x = (__vector float*) x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - temp2 += v_x[i] * va2[i]; - temp3 += v_x[i] * va3[i]; - } - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - -} - - -static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { - - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - __vector float temp1 = {0,0,0,0}; - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - } - - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); -} - -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - - BLASLONG i; - FLOAT *a0; - a0 = ap; - __vector float* va0 = (__vector float*) a0; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i] ; - } - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - -} - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest++ = *src; - src += inc_src; - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - BLASLONG j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[8] __attribute__((aligned(16))); - FLOAT *xbuffer; - if (m < 1) return (0); - if (n < 1) return (0); - - xbuffer = buffer; - - n1 = n >> 3; - n2 = n & 7; - - m3 = m & 3; - m1 = m - m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if (inc_x != 1) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - BLASLONG lda8 = lda << 3; - - - if (inc_y == 1) { - - for (i = 0; i < n1; i++) { - - sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); - - y_ptr += 8; - a_ptr += lda8; - - } - - } else { - - for (i = 0; i < n1; i++) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - ybuffer[4] = 0; - ybuffer[5] = 0; - ybuffer[6] = 0; - ybuffer[7] = 0; - sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - - *y_ptr += ybuffer[4]; - y_ptr += inc_y; - *y_ptr += ybuffer[5]; - y_ptr += inc_y; - *y_ptr += ybuffer[6]; - y_ptr += inc_y; - *y_ptr += ybuffer[7]; - y_ptr += inc_y; - - a_ptr += lda8; - } - - } - - - if (n2 & 4) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - 
a_ptr += lda<<2; - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - } - - if (n2 & 2) { - sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); - a_ptr += lda << 1; - y_ptr += 2 * inc_y; - - } - - if (n2 & 1) { - sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); - a_ptr += lda; - y_ptr += inc_y; - - } - - a += NB; - x += NB * inc_x; - - - } - - if (m3 == 0) return (0); - - x_ptr = x; - a_ptr = a; - if (m3 == 3) { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if (lda == 3 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; - y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; - y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; - aj += 12; - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - aj += 3; - } - - } else { - - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - aj += lda; - } - - } else { - - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - y_ptr += inc_y; - aj += lda; - } - - } - - } - return (0); - } - - if (m3 == 2) { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if (lda == 2 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; - y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; - y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; - aj += 8; - - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - aj += 2; - } - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr += inc_y; - aj += lda; - } - } - - } - return (0); - - } - - FLOAT xtemp = *x_ptr * alpha; - FLOAT *aj = a_ptr; - y_ptr = y; - if (lda == 1 && inc_y == 1) { - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[j] * xtemp; - y_ptr[j + 1] += aj[j + 1] * xtemp; - y_ptr[j + 2] += aj[j + 2] * xtemp; - y_ptr[j + 3] += aj[j + 3] * xtemp; - } - 
for (; j < n; j++) { - y_ptr[j] += aj[j] * xtemp; - } - - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += *aj * xtemp; - y_ptr[j + 1] += *(aj + lda) * xtemp; - y_ptr[j + 2] += *(aj + lda2) * xtemp; - y_ptr[j + 3] += *(aj + lda3) * xtemp; - aj += lda4; - } - - for (; j < n; j++) { - y_ptr[j] += *aj * xtemp; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp; - y_ptr += inc_y; - aj += lda; - } - - } - } - - return (0); - -} - -#endif +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_t.c" + +#else + +#include "common.h" + +#define NBMAX 2048 + +#include + +static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + register __vector float temp4 = {0,0,0,0}; + register __vector float temp5 = {0,0,0,0}; + register __vector float temp6 = {0,0,0,0}; + register __vector float temp7 = {0,0,0,0}; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (__vector float*) a0; + va1 = (__vector float*) a1; + va2 = (__vector float*) a2; + va3 = (__vector float*) a3; + va4 = (__vector float*) a4; + va5 = (__vector float*) a5; + va6 = (__vector float*) a6; + va7 = (__vector float*) a7; + v_x = (__vector float*) x; + + + for (i = 0; i < n/4; i ++) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + temp4 += v_x[i] * va4[i]; + temp5 += v_x[i] * va5[i]; + temp6 += v_x[i] * va6[i]; + temp7 += v_x[i] * va7[i]; + } + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + + y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); + y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); + y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); + y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); + +} + + +static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i = 0; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* va2 = (__vector float*) a2; + __vector float* va3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + +} + + +static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { + + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + __vector float temp1 = {0,0,0,0}; + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + } + + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + 
temp0[3]); + y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); +} + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + + BLASLONG i; + FLOAT *a0; + a0 = ap; + __vector float* va0 = (__vector float*) a0; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + for (i = 0; i < n / 4; i ++) { + temp0 += v_x[i] * va0[i] ; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; + if (m < 1) return (0); + if (n < 1) return (0); + + xbuffer = buffer; + + n1 = n >> 3; + n2 = n & 7; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 1) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + BLASLONG lda8 = lda << 3; + + + if (inc_y == 1) { + + for (i = 0; i < n1; i++) { + + sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); + + y_ptr += 8; + a_ptr += lda8; + + } + + } else { + + for (i = 0; i < n1; i++) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + ybuffer[4] = 0; + ybuffer[5] = 0; + ybuffer[6] = 0; + ybuffer[7] = 0; + sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + + *y_ptr += ybuffer[4]; + y_ptr += inc_y; + *y_ptr += ybuffer[5]; + y_ptr += inc_y; + *y_ptr += ybuffer[6]; + y_ptr += inc_y; + *y_ptr += ybuffer[7]; + y_ptr += inc_y; + + a_ptr += lda8; + } + + } + + + if (n2 & 4) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + a_ptr += lda<<2; + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + } + + if (n2 & 2) { + sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); + a_ptr += lda << 1; + y_ptr += 2 * inc_y; + + } + + if (n2 & 1) { + sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); + a_ptr += lda; + y_ptr += inc_y; + + } + + a += NB; + x += NB * inc_x; + + + } + + if (m3 == 0) return (0); + + x_ptr = x; + a_ptr = a; + if (m3 == 3) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 3 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * 
xtemp2; + aj += 3; + } + + } else { + + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr += inc_y; + aj += lda; + } + + } + + } + return (0); + } + + if (m3 == 2) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; + y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + return (0); + + } + + FLOAT xtemp = *x_ptr * alpha; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + } + + return (0); + +} + +#endif diff --git a/kernel/power/sgemv_t_8.c b/kernel/power/sgemv_t_8.c index b90512162..1ee7c8aeb 100644 --- a/kernel/power/sgemv_t_8.c +++ b/kernel/power/sgemv_t_8.c @@ -1,508 +1,508 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. 
Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - - -/****Note*** -UnUsed kernel -This kernel works. But it was not competitive enough to be added in production -It could be used and tested in future or could be used as base for switching to inline assembly -*/ - -#include "common.h" -#include -#define NBMAX 4096 - -#include - -static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; - __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - register __vector float temp4 = {0,0,0,0}; - register __vector float temp5 = {0,0,0,0}; - register __vector float temp6 = {0,0,0,0}; - register __vector float temp7 = {0,0,0,0}; - - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - a4 = a3 + lda; - a5 = a4 + lda; - a6 = a5 + lda; - a7 = a6 + lda; - va0 = (__vector float*) a0; - va1 = (__vector float*) a1; - va2 = (__vector float*) a2; - va3 = (__vector float*) a3; - va4 = (__vector float*) a4; - va5 = (__vector float*) a5; - va6 = (__vector float*) a6; - va7 = (__vector float*) a7; - v_x = (__vector float*) x; - - - for (i = 0; i < n/4; i +=2) { - register __vector float vx1=v_x[i] ; - register __vector float vx2=v_x[i+1] ; - register __vector float va0_1=va0[i] ; - register __vector float va0_2=va0[i+1] ; - register __vector float va1_1=va1[i] ; - register __vector float va1_2=va1[i+1] ; - register __vector float va2_1=va2[i] ; - register __vector float va2_2=va2[i+1] ; - register __vector float va3_1=va3[i] ; - register __vector float va3_2=va3[i+1] ; - register __vector float va4_1=va4[i] ; - register __vector float va4_2=va4[i+1] ; - register __vector float va5_1=va5[i] ; - register __vector float va5_2=va5[i+1] ; - register __vector float va6_1=va6[i] ; - register __vector float va6_2=va6[i+1] ; - register __vector float va7_1=va7[i] ; - register __vector float va7_2=va7[i+1] ; - temp0 += vx1* va0_1 + vx2 * va0_2; - temp1 += vx1* va1_1 + vx2 * va1_2; - temp2 += vx1* va2_1 + vx2 * va2_2; - 
temp3 += vx1* va3_1 + vx2 * va3_2; - temp4 += vx1* va4_1 + vx2 * va4_2; - temp5 += vx1* va5_1 + vx2 * va5_2; - temp6 += vx1* va6_1 + vx2 * va6_2; - temp7 += vx1* va7_1 + vx2 * va7_2; - } - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - - y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); - y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); - y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); - y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); - -} - - -static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i = 0; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* va2 = (__vector float*) a2; - __vector float* va3 = (__vector float*) a3; - __vector float* v_x = (__vector float*) x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - - for (i = 0; i < n / 4; i +=2) { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; - temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1]; - temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1]; - } - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - -} - - -static void sgemv_kernel_8x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { - - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - __vector float temp1 = {0,0,0,0}; - for (i = 0; i < n / 4; i +=2) { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; - } - - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); -} - -static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - - BLASLONG i; - FLOAT *a0; - a0 = ap; - __vector float* va0 = (__vector float*) a0; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - for (i = 0; i < n / 4; i +=2) { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; - } - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - -} - - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest++ = *src; - src += inc_src; - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - BLASLONG j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - - FLOAT ybuffer[8] __attribute__((aligned(16))); - FLOAT *xbuffer; - if (m < 1) return (0); - if (n < 1) return (0); - - xbuffer = buffer; - - n1 = n >> 3; - n2 = n & 7; - - m3 = m & 7; - m1 = m - 
m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if (inc_x != 1) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - BLASLONG lda8 = lda << 3; - - - if (inc_y == 1) { - - for (i = 0; i < n1; i++) { - - sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); - - y_ptr += 8; - a_ptr += lda8; - - } - - } else { - - for (i = 0; i < n1; i++) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - ybuffer[4] = 0; - ybuffer[5] = 0; - ybuffer[6] = 0; - ybuffer[7] = 0; - sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - - *y_ptr += ybuffer[4]; - y_ptr += inc_y; - *y_ptr += ybuffer[5]; - y_ptr += inc_y; - *y_ptr += ybuffer[6]; - y_ptr += inc_y; - *y_ptr += ybuffer[7]; - y_ptr += inc_y; - - a_ptr += lda8; - } - - } - - - if (n2 & 4) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - sgemv_kernel_8x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - a_ptr += lda<<2; - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - } - - if (n2 & 2) { - sgemv_kernel_8x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); - a_ptr += lda << 1; - y_ptr += 2 * inc_y; - - } - - if (n2 & 1) { - sgemv_kernel_8x1(NB, a_ptr, xbuffer, y_ptr, alpha); - a_ptr += lda; - y_ptr += inc_y; - - } - - a += NB; - x += NB * inc_x; - - - } - - if (m3 == 0) return (0); - - x_ptr = x; - a_ptr = a; - if (m3 & 4) { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp3 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT *aj = a_ptr; - y_ptr = y; - if (lda == 4 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; - y_ptr[j + 1] += aj[4] * xtemp0 + aj[5] * xtemp1 + aj[6] * xtemp2 + aj[7] * xtemp3; - y_ptr[j + 2] += aj[8] * xtemp0 + aj[9] * xtemp1 + aj[10] * xtemp2 + aj[11] * xtemp3; - y_ptr[j + 3] += aj[12] * xtemp0 + aj[13] * xtemp1 + aj[14] * xtemp2 + aj[15] * xtemp3; - aj += 16; - - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; - aj += 4; - } - - } else if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2 + *(aj + 3) * xtemp3; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2 + *(aj + lda +3) * xtemp3; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2 + *(aj + lda2 +3) * xtemp3; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2 + *(aj + lda3+3) * xtemp3; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+*(aj + 3) * xtemp3; - aj += lda; - } - - } else { - - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+ *(aj + 3) * xtemp3; - y_ptr += inc_y; - aj += lda; - } - - } - if 
(m3==4) return (0); - a_ptr += 4; - } - - if (m3 & 2 ) { - - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT *aj = a_ptr; - y_ptr = y; - - if (lda == 2 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; - y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; - y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; - aj += 8; - - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - aj += 2; - } - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr += inc_y; - aj += lda; - } - } - - } - if (m3==2) return (0); - a_ptr += 2; - } - if (m3 & 1) { - - FLOAT xtemp = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT *aj = a_ptr; - y_ptr = y; - if (lda == 1 && inc_y == 1) { - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[j] * xtemp; - y_ptr[j + 1] += aj[j + 1] * xtemp; - y_ptr[j + 2] += aj[j + 2] * xtemp; - y_ptr[j + 3] += aj[j + 3] * xtemp; - } - for (; j < n; j++) { - y_ptr[j] += aj[j] * xtemp; - } - - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += *aj * xtemp; - y_ptr[j + 1] += *(aj + lda) * xtemp; - y_ptr[j + 2] += *(aj + lda2) * xtemp; - y_ptr[j + 3] += *(aj + lda3) * xtemp; - aj += lda4; - } - - for (; j < n; j++) { - y_ptr[j] += *aj * xtemp; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp; - y_ptr += inc_y; - aj += lda; - } - - } - - } - a_ptr += 1; - } - return (0); - -} - +/*************************************************************************** +Copyright (c) 2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + + +/****Note*** +UnUsed kernel +This kernel works. But it was not competitive enough to be added in production +It could be used and tested in future or could be used as base for switching to inline assembly +*/ + +#include "common.h" +#include +#define NBMAX 4096 + +#include + +static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + register __vector float temp4 = {0,0,0,0}; + register __vector float temp5 = {0,0,0,0}; + register __vector float temp6 = {0,0,0,0}; + register __vector float temp7 = {0,0,0,0}; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (__vector float*) a0; + va1 = (__vector float*) a1; + va2 = (__vector float*) a2; + va3 = (__vector float*) a3; + va4 = (__vector float*) a4; + va5 = (__vector float*) a5; + va6 = (__vector float*) a6; + va7 = (__vector float*) a7; + v_x = (__vector float*) x; + + + for (i = 0; i < n/4; i +=2) { + register __vector float vx1=v_x[i] ; + register __vector float vx2=v_x[i+1] ; + register __vector float va0_1=va0[i] ; + register __vector float va0_2=va0[i+1] ; + register __vector float va1_1=va1[i] ; + register __vector float va1_2=va1[i+1] ; + register __vector float va2_1=va2[i] ; + register __vector float va2_2=va2[i+1] ; + register __vector float va3_1=va3[i] ; + register __vector float va3_2=va3[i+1] ; + register __vector float va4_1=va4[i] ; + register __vector float va4_2=va4[i+1] ; + register __vector float va5_1=va5[i] ; + register __vector float va5_2=va5[i+1] ; + register __vector float va6_1=va6[i] ; + register __vector float va6_2=va6[i+1] ; + register __vector float va7_1=va7[i] ; + register __vector float va7_2=va7[i+1] ; + temp0 += vx1* va0_1 + vx2 * va0_2; + temp1 += vx1* va1_1 + vx2 * va1_2; + temp2 += vx1* va2_1 + vx2 * va2_2; + temp3 += vx1* va3_1 + vx2 * va3_2; + temp4 += vx1* va4_1 + vx2 * va4_2; + temp5 += vx1* va5_1 + vx2 * va5_2; + temp6 += vx1* va6_1 + vx2 * va6_2; + temp7 += vx1* va7_1 + vx2 * va7_2; + } + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + + y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); + y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); + y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); + y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); + +} + + +static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT 
*y, FLOAT alpha) { + BLASLONG i = 0; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* va2 = (__vector float*) a2; + __vector float* va3 = (__vector float*) a3; + __vector float* v_x = (__vector float*) x; + register __vector float temp0 = {0,0,0,0}; + register __vector float temp1 = {0,0,0,0}; + register __vector float temp2 = {0,0,0,0}; + register __vector float temp3 = {0,0,0,0}; + + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; + temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1]; + temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1]; + } + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); + y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); + y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + +} + + +static void sgemv_kernel_8x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { + + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector float* va0 = (__vector float*) a0; + __vector float* va1 = (__vector float*) a1; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + __vector float temp1 = {0,0,0,0}; + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; + } + + + + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); +} + +static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + + BLASLONG i; + FLOAT *a0; + a0 = ap; + __vector float* va0 = (__vector float*) a0; + __vector float* v_x = (__vector float*) x; + __vector float temp0 = {0,0,0,0}; + for (i = 0; i < n / 4; i +=2) { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; + } + y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); + +} + + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; + if (m < 1) return (0); + if (n < 1) return (0); + + xbuffer = buffer; + + n1 = n >> 3; + n2 = n & 7; + + m3 = m & 7; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 1) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + BLASLONG lda8 = lda << 3; + + + if (inc_y == 1) { + + for (i = 0; i < n1; i++) { + + sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); + + y_ptr += 8; + a_ptr += lda8; + + } + + } else { + + for (i = 0; i < n1; i++) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + ybuffer[4] = 0; + ybuffer[5] = 0; + ybuffer[6] = 0; + ybuffer[7] = 0; + sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += 
ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + + *y_ptr += ybuffer[4]; + y_ptr += inc_y; + *y_ptr += ybuffer[5]; + y_ptr += inc_y; + *y_ptr += ybuffer[6]; + y_ptr += inc_y; + *y_ptr += ybuffer[7]; + y_ptr += inc_y; + + a_ptr += lda8; + } + + } + + + if (n2 & 4) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + sgemv_kernel_8x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + a_ptr += lda<<2; + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + } + + if (n2 & 2) { + sgemv_kernel_8x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); + a_ptr += lda << 1; + y_ptr += 2 * inc_y; + + } + + if (n2 & 1) { + sgemv_kernel_8x1(NB, a_ptr, xbuffer, y_ptr, alpha); + a_ptr += lda; + y_ptr += inc_y; + + } + + a += NB; + x += NB * inc_x; + + + } + + if (m3 == 0) return (0); + + x_ptr = x; + a_ptr = a; + if (m3 & 4) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp3 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 4 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; + y_ptr[j + 1] += aj[4] * xtemp0 + aj[5] * xtemp1 + aj[6] * xtemp2 + aj[7] * xtemp3; + y_ptr[j + 2] += aj[8] * xtemp0 + aj[9] * xtemp1 + aj[10] * xtemp2 + aj[11] * xtemp3; + y_ptr[j + 3] += aj[12] * xtemp0 + aj[13] * xtemp1 + aj[14] * xtemp2 + aj[15] * xtemp3; + aj += 16; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; + aj += 4; + } + + } else if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2 + *(aj + 3) * xtemp3; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2 + *(aj + lda +3) * xtemp3; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2 + *(aj + lda2 +3) * xtemp3; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2 + *(aj + lda3+3) * xtemp3; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+*(aj + 3) * xtemp3; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+ *(aj + 3) * xtemp3; + y_ptr += inc_y; + aj += lda; + } + + } + if (m3==4) return (0); + a_ptr += 4; + } + + if (m3 & 2 ) { + + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; + y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + 
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + if (m3==2) return (0); + a_ptr += 2; + } + if (m3 & 1) { + + FLOAT xtemp = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + + } + a_ptr += 1; + } + return (0); + +} + diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S index d1e60da6c..f9320d516 100644 --- a/kernel/power/zgemm_kernel_power9.S +++ b/kernel/power/zgemm_kernel_power9.S @@ -1,245 +1,245 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
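
Read scalar-wise, the 8x8 VSX micro-kernel above accumulates eight dot products at once, four float lanes per vector register, and only performs the horizontal lane reduction and the multiplication by alpha at the very end. A plain-C sketch of that accumulation follows; the name is illustrative and the two-vectors-per-iteration unrolling of the real kernel is omitted.

#include <stddef.h>

/* Scalar rendering of what sgemv_kernel_8x8 accumulates: eight adjacent
   stored columns, one dot product each against the same x block of nb rows. */
static void sgemv_kernel_8x8_ref(size_t nb, size_t lda, const float *ap,
                                 const float *x, float *y, float alpha)
{
    for (size_t k = 0; k < 8; k++) {
        float sum = 0.0f;                 /* plays the role of temp0..temp7 */
        for (size_t i = 0; i < nb; i++)
            sum += ap[k * lda + i] * x[i];
        y[k] += alpha * sum;              /* lane reduction and alpha, once */
    }
}
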
-*****************************************************************************/ -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - -#define LOAD ld - -#define STACKSIZE 512 - -#define FZERO 312+192(SP) - -#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ - -#define M r3 -#define N r4 -#define K r5 - - -#define A r8 -#define B r9 -#define C r10 -#define LDC r6 -#define OFFSET r7 - - - -#define o0 0 -#define alpha_r vs30 -#define alpha_i vs31 - -#define VECSAVE r11 - -#define FRAMEPOINTER r12 - -#define T10 r14 - -#define L r15 -#define T8 r16 -#define T5 r17 -#define T2 r19 -#define TEMP_REG r20 -#define T6 r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define T7 r27 -#define T3 r28 -#define T4 r29 - -#define PRE r30 -#define T1 r31 - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - mr FRAMEPOINTER, SP - addi SP, SP, -STACKSIZE - mflr r0 - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - xxspltd alpha_r,vs1,0 /*copy from register f1 */ - xxspltd alpha_i,vs2,0 /*copy from register f2 */ - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - - std r0, FLINK_SAVE(SP) - - -#if defined(linux) || defined(__FreeBSD__) - ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) -#endif - - -#ifdef TRMMKERNEL -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) - ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) -#endif -#endif - - -#include "zgemm_macros_power9.S" - - - - slwi LDC, LDC, ZBASE_SHIFT - li PRE, 512 - li r0, 0 - - -#if defined(CC) || defined(CR) || defined(RC) || defined(RR) -/*negate for this case as we will use addition -1*(a+b) */ - xvnegdp alpha_r,alpha_r - xvnegdp alpha_i,alpha_i -#endif - .align 4 - -#include "zgemm_logic_power9.S" - -L999: - - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - ld r0, FLINK_SAVE(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - mtlr r0 - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - 
EPILOGUE +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define LOAD ld + +#define STACKSIZE 512 + +#define FZERO 312+192(SP) + +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ + +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + + +#define o0 0 +#define alpha_r vs30 +#define alpha_i vs31 + +#define VECSAVE r11 + +#define FRAMEPOINTER r12 + +#define T10 r14 + +#define L r15 +#define T8 r16 +#define T5 r17 +#define T2 r19 +#define TEMP_REG r20 +#define T6 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T7 r27 +#define T3 r28 +#define T4 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + mflr r0 + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + xxspltd alpha_r,vs1,0 /*copy from register f1 */ + xxspltd alpha_i,vs2,0 /*copy from register f2 */ + + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) 
+ + std r0, FLINK_SAVE(SP) + + +#if defined(linux) || defined(__FreeBSD__) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) +#endif + + +#ifdef TRMMKERNEL +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) +#endif +#endif + + +#include "zgemm_macros_power9.S" + + + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 512 + li r0, 0 + + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegdp alpha_r,alpha_r + xvnegdp alpha_i,alpha_i +#endif + .align 4 + +#include "zgemm_logic_power9.S" + +L999: + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE #endif \ No newline at end of file diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S index fe5d8ade2..850b41aff 100644 --- a/kernel/power/zgemm_logic_power9.S +++ b/kernel/power/zgemm_logic_power9.S @@ -1,1891 +1,1891 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
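
Aside from the register spills, the prologue above does only a little scalar set-up: LDC arrives as an element count and is shifted into a byte stride, alpha_r/alpha_i are splatted across vector registers, and the builds selected by the #if (CC, CR, RC, RR) negate alpha up front, as the kernel's own comment notes ("we will use addition -1*(a+b)"). A rough C analogue is sketched here, under the assumption that ZBASE_SHIFT is 4 (one double-complex element being 16 bytes); the function and parameter names are illustrative only.

#include <stdint.h>

/* Sketch of the prologue's scalar set-up (assumption: ZBASE_SHIFT == 4). */
static void zgemm_prologue_sketch(int64_t ldc, double alpha_r, double alpha_i,
                                  int negate_alpha)
{
    int64_t ldc_bytes = ldc << 4;               /* slwi LDC, LDC, ZBASE_SHIFT */
    double alpha_r_v[2] = { alpha_r, alpha_r }; /* xxspltd alpha_r, vs1, 0    */
    double alpha_i_v[2] = { alpha_i, alpha_i }; /* xxspltd alpha_i, vs2, 0    */
    if (negate_alpha) {                         /* CC / CR / RC / RR builds   */
        alpha_r_v[0] = -alpha_r_v[0]; alpha_r_v[1] = -alpha_r_v[1];
        alpha_i_v[0] = -alpha_i_v[0]; alpha_i_v[1] = -alpha_i_v[1];
    }
    (void)ldc_bytes; (void)alpha_r_v; (void)alpha_i_v;
}
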
-*****************************************************************************/ -#define MY_ALIGN .align 3 -b ZGEMM_L2 -/* MINI SUBROUTINES */ -/* 2x8 MAIN 128x+2 LOOP */ - - -ZGEMM_L2x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x8_2 - MY_ALIGN -ZGEMM_L2x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 -ZGEMM_L2x8_K128: -/*----------------------------------------*/ - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_L2 256,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 256,64,8,0 - KERNEL2x8_L2 256,64,9,0 - KERNEL2x8_L2 256,64,10,0 - KERNEL2x8_L2 256,64,11,0 - dcbt BO, T4 - KERNEL2x8_L2 256,64,12,0 - KERNEL2x8_L2 256,64,13,0 - KERNEL2x8_L2 256,64,14,0 - KERNEL2x8_L2 256,64,15,0 - KERNEL2x8_L2 256,64,16,0 - KERNEL2x8_L2 256,64,17,0 - KERNEL2x8_L2 256,64,18,0 - KERNEL2x8_L2 256,64,19,0 - KERNEL2x8_L2 256,64,20,0 - KERNEL2x8_L2 256,64,21,0 - KERNEL2x8_L2 256,64,22,0 - KERNEL2x8_L2 256,64,23,0 - KERNEL2x8_L2 256,64,24,0 - KERNEL2x8_L2 256,64,25,0 - KERNEL2x8_L2 256,64,26,0 - KERNEL2x8_L2 256,64,27,0 - KERNEL2x8_L2 256,64,28,0 - KERNEL2x8_L2 256,64,29,0 - KERNEL2x8_L2 256,64,30,0 - KERNEL2x8_L2 256,64,31,0 - KERNEL2x8_L2 256,64,32,0 - KERNEL2x8_L2 256,64,33,0 - KERNEL2x8_L2 256,64,34,0 - KERNEL2x8_L2 256,64,35,0 - KERNEL2x8_L2 256,64,36,0 - KERNEL2x8_L2 256,64,37,0 - KERNEL2x8_L2 256,64,38,0 - KERNEL2x8_L2 256,64,39,0 - KERNEL2x8_L2 256,64,40,0 - KERNEL2x8_L2 256,64,41,0 - KERNEL2x8_L2 256,64,42,0 - KERNEL2x8_L2 256,64,43,0 - KERNEL2x8_L2 256,64,44,0 - KERNEL2x8_L2 256,64,45,0 - KERNEL2x8_L2 256,64,46,0 - KERNEL2x8_L2 256,64,47,0 - KERNEL2x8_L2 256,64,48,0 - KERNEL2x8_L2 256,64,49,0 - KERNEL2x8_L2 256,64,50,0 - KERNEL2x8_L2 256,64,51,0 - KERNEL2x8_L2 256,64,52,0 - KERNEL2x8_L2 256,64,53,0 - KERNEL2x8_L2 256,64,54,0 - KERNEL2x8_L2 256,64,55,0 - KERNEL2x8_L2 256,64,56,0 - KERNEL2x8_L2 256,64,57,0 - KERNEL2x8_L2 256,64,58,0 - KERNEL2x8_L2 256,64,59,0 - KERNEL2x8_L2 256,64,60,0 - KERNEL2x8_L2 256,64,61,0 - KERNEL2x8_L2 256,64,62,0 - KERNEL2x8_L2 256,64,63,1 - bdnz ZGEMM_L2x8_LOOP - MY_ALIGN -ZGEMM_L2x8_LOOP_END: -/*----------------------------------------*/ - END2x8_2 - blr - MY_ALIGN - - -ZGEMM_2x8_L64_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_L2 256,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 256,64,8,0 - KERNEL2x8_L2 256,64,9,0 - KERNEL2x8_L2 256,64,10,0 - KERNEL2x8_L2 256,64,11,0 - dcbt BO, T4 - KERNEL2x8_L2 256,64,12,0 - KERNEL2x8_L2 256,64,13,0 - KERNEL2x8_L2 256,64,14,0 - KERNEL2x8_L2 256,64,15,0 - KERNEL2x8_L2 256,64,16,0 - KERNEL2x8_L2 256,64,17,0 - KERNEL2x8_L2 256,64,18,0 - KERNEL2x8_L2 256,64,19,0 - KERNEL2x8_L2 256,64,20,0 - KERNEL2x8_L2 256,64,21,0 - KERNEL2x8_L2 256,64,22,0 - KERNEL2x8_L2 256,64,23,0 - KERNEL2x8_L2 256,64,24,0 - KERNEL2x8_L2 256,64,25,0 - KERNEL2x8_L2 256,64,26,0 - KERNEL2x8_L2 256,64,27,0 - KERNEL2x8_L2 256,64,28,0 - KERNEL2x8_L2 256,64,29,0 - KERNEL2x8_L2 256,64,30,0 - KERNEL2x8_E2 256,64,31,1 - blr - MY_ALIGN - - -ZGEMM_2x8_L32_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt 
AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_L2 256,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 256,64,8,0 - KERNEL2x8_L2 256,64,9,0 - KERNEL2x8_L2 256,64,10,0 - KERNEL2x8_L2 256,64,11,0 - dcbt BO, T4 - KERNEL2x8_L2 256,64,12,0 - KERNEL2x8_L2 256,64,13,0 - KERNEL2x8_L2 256,64,14,0 - KERNEL2x8_E2 256,64,15,1 - blr - MY_ALIGN - - -ZGEMM_2x8_L16_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_E2 256,64,7,1 - blr - MY_ALIGN - - -ZGEMM_2x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x4_2 - MY_ALIGN -ZGEMM_L2x4_LOOP: -/*----------------------------------------*/ - KERNEL2x4_L2 128,64,0,0 -ZGEMM_L2x4_K32: -/*----------------------------------------*/ - KERNEL2x4_L2 128,64,1,0 - KERNEL2x4_L2 128,64,2,0 - KERNEL2x4_L2 128,64,3,0 - KERNEL2x4_L2 128,64,4,0 - KERNEL2x4_L2 128,64,5,0 - KERNEL2x4_L2 128,64,6,0 - KERNEL2x4_L2 128,64,7,0 - KERNEL2x4_L2 128,64,8,0 - KERNEL2x4_L2 128,64,9,0 - KERNEL2x4_L2 128,64,10,0 - KERNEL2x4_L2 128,64,11,0 - KERNEL2x4_L2 128,64,12,0 - KERNEL2x4_L2 128,64,13,0 - KERNEL2x4_L2 128,64,14,0 - KERNEL2x4_L2 128,64,15,1 - bdnz ZGEMM_L2x4_LOOP - MY_ALIGN -ZGEMM_L2x4_LOOP_END: -/*----------------------------------------*/ - END2x4_2 - blr - MY_ALIGN - - -ZGEMM_2x4_L16_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 128,64,0,0 - KERNEL2x4_L2 128,64,1,0 - KERNEL2x4_L2 128,64,2,0 - KERNEL2x4_L2 128,64,3,0 - KERNEL2x4_L2 128,64,4,0 - KERNEL2x4_L2 128,64,5,0 - KERNEL2x4_L2 128,64,6,0 - KERNEL2x4_E2 128,64,7,1 - blr - MY_ALIGN - - -ZGEMM_2x4_L8_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 128,64,0,0 - KERNEL2x4_L2 128,64,1,0 - KERNEL2x4_L2 128,64,2,0 - KERNEL2x4_E2 128,64,3,1 - blr - - -ZGEMM_2x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x2_2 - MY_ALIGN -ZGEMM_L2x2_LOOP: -/*----------------------------------------*/ - KERNEL2x2_L2 64,64,0,0 -ZGEMM_L2x2_K32: -/*----------------------------------------*/ - KERNEL2x2_L2 64,64,1,0 - KERNEL2x2_L2 64,64,2,0 - KERNEL2x2_L2 64,64,3,0 - KERNEL2x2_L2 64,64,4,0 - KERNEL2x2_L2 64,64,5,0 - KERNEL2x2_L2 64,64,6,0 - KERNEL2x2_L2 64,64,7,0 - KERNEL2x2_L2 64,64,8,0 - KERNEL2x2_L2 64,64,9,0 - KERNEL2x2_L2 64,64,10,0 - KERNEL2x2_L2 64,64,11,0 - KERNEL2x2_L2 64,64,12,0 - KERNEL2x2_L2 64,64,13,0 - KERNEL2x2_L2 64,64,14,0 - KERNEL2x2_L2 64,64,15,1 - bdnz ZGEMM_L2x2_LOOP - MY_ALIGN - - -ZGEMM_L2x2_LOOP_END: -/*----------------------------------------*/ - END2x2_2 - blr - MY_ALIGN -ZGEMM_2x2_L16_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 64,64,0,0 - KERNEL2x2_L2 64,64,1,0 - KERNEL2x2_L2 64,64,2,0 - KERNEL2x2_L2 64,64,3,0 - KERNEL2x2_L2 64,64,4,0 - KERNEL2x2_L2 64,64,5,0 - KERNEL2x2_L2 64,64,6,0 - KERNEL2x2_E2 64,64,7,1 - blr - MY_ALIGN -ZGEMM_2x2_L8_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 64,64,0,0 - KERNEL2x2_L2 64,64,1,0 - KERNEL2x2_L2 64,64,2,0 - KERNEL2x2_E2 64,64,3,1 - blr - - -ZGEMM_2x1_LMAIN_SUB: 
-/*----------------------------------------*/ - mtctr T8 - LOAD2x1_2 - MY_ALIGN -ZGEMM_L2x1_LOOP: -/*----------------------------------------*/ - KERNEL2x1_L2 32,64,0,0 -ZGEMM_L2x1_K32: -/*----------------------------------------*/ - KERNEL2x1_L2 32,64,1,0 - KERNEL2x1_L2 32,64,2,0 - KERNEL2x1_L2 32,64,3,0 - KERNEL2x1_L2 32,64,4,0 - KERNEL2x1_L2 32,64,5,0 - KERNEL2x1_L2 32,64,6,0 - KERNEL2x1_L2 32,64,7,0 - KERNEL2x1_L2 32,64,8,0 - KERNEL2x1_L2 32,64,9,0 - KERNEL2x1_L2 32,64,10,0 - KERNEL2x1_L2 32,64,11,0 - KERNEL2x1_L2 32,64,12,0 - KERNEL2x1_L2 32,64,13,0 - KERNEL2x1_L2 32,64,14,0 - KERNEL2x1_L2 32,64,15,1 - bdnz ZGEMM_L2x1_LOOP - MY_ALIGN -ZGEMM_L2x1_LOOP_END: -/*----------------------------------------*/ - END2x1_2 - blr - - MY_ALIGN -ZGEMM_2x1_L16_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 32,64,0,0 - KERNEL2x1_L2 32,64,1,0 - KERNEL2x1_L2 32,64,2,0 - KERNEL2x1_L2 32,64,3,0 - KERNEL2x1_L2 32,64,4,0 - KERNEL2x1_L2 32,64,5,0 - KERNEL2x1_L2 32,64,6,0 - KERNEL2x1_E2 32,64,7,1 - blr - MY_ALIGN - - -ZGEMM_2x1_L8_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 32,64,0,0 - KERNEL2x1_L2 32,64,1,0 - KERNEL2x1_L2 32,64,2,0 - KERNEL2x1_E2 32,64,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -ZGEMM_L2: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - srawi. J, N, 1 - ble ZGEMM_L2_END - - -ZGEMM_L2_BEGIN: -/*----------------------------------------*/ - mr CO, C - slwi T1, LDC , 1 - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble ZGEMM_L2x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -ZGEMM_L2x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T11-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO2x8 - ble ZGEMM_L2x8_SUB0 - bl ZGEMM_L2x8_LMAIN_SUB - andi. L, T1, 127 - ble ZGEMM_L2x8_SAVE - b ZGEMM_L2x8_SUB2 - - -ZGEMM_L2x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP2x8_128K - addi BO,BO,-32 - addi AO,AO,-128 - LOAD2x8O 128,32 - END2x8_WITHOUT_ADD - LOAD2x8_2O 256, 64 - mtctr T8 - bl ZGEMM_L2x8_K128 - b ZGEMM_L2x8_SAVE - CMP2x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne ZGEMM_L2x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-256 - LOAD2x8_2O 256,64 - bl ZGEMM_L2x8_K128 - b ZGEMM_L2x8_SAVE - MY_ALIGN - - -ZGEMM_L2x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble ZGEMM_L2x8_SUB2_32 - bl ZGEMM_2x8_L64_SUB - MY_ALIGN - - -ZGEMM_L2x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble ZGEMM_L2x8_SUB2_16 - bl ZGEMM_2x8_L32_SUB - MY_ALIGN - - -ZGEMM_L2x8_SUB2_16: -/*----------------------------------------*/ - andi. 
T1,L, 16 - ble ZGEMM_L2x8_SUB2_8 - bl ZGEMM_2x8_L16_SUB - MY_ALIGN - - -ZGEMM_L2x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L2x8_SUB2_4 - LOAD2x8_2 - KERNEL2x8_L2 256,64, 0,0 - KERNEL2x8_L2 256,64, 1,0 - KERNEL2x8_L2 256,64, 2,0 - KERNEL2x8_E2 256,64, 3,1 - MY_ALIGN - - -ZGEMM_L2x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x8_SUB2_2 - LOAD2x8_2 - KERNEL2x8_L2 256,64, 0,0 - KERNEL2x8_E2 256,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x8_SUB2_1 - LOAD2x8_2 - KERNEL2x8_E2 256,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x8_SAVE - KERNEL2x8 - - -ZGEMM_L2x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 -#endif - bgt ZGEMM_L2x8_BEGIN - andi. T2, M, 7 - ble ZGEMM_L2x1_END - andi. T1, M, 4 - ble ZGEMM_L2x4_END - b ZGEMM_L2x4_BEGIN - MY_ALIGN - - -ZGEMM_L2x8_END: -/*----------------------------------------*/ - - -ZGEMM_L2x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble ZGEMM_L2x1_END - andi. T1, M, 4 - ble ZGEMM_L2x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x4 - ble ZGEMM_L2x4_SUB0 - bl ZGEMM_2x4_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x4_SAVE - b ZGEMM_L2x4_SUB2 - - -ZGEMM_L2x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x4_32K - addi BO,BO,-32 - addi AO,AO,-64 - LOAD2x4O 64,32 - END2x4_WITHOUT_ADD - LOAD2x4_2O 128, 64 - mtctr T8 - bl ZGEMM_L2x4_K32 - b ZGEMM_L2x4_SAVE - CMP2x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L2x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-128 - LOAD2x4_2O 128,64 - bl ZGEMM_L2x4_K32 - b ZGEMM_L2x4_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L2x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L2x4_SUB2_8 - bl ZGEMM_2x4_L16_SUB - MY_ALIGN - - -ZGEMM_L2x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L2x4_SUB2_4 - bl ZGEMM_2x4_L8_SUB - MY_ALIGN - - -ZGEMM_L2x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x4_SUB2_2 - LOAD2x4_2 - KERNEL2x4_L2 128,64, 0,0 - KERNEL2x4_E2 128,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x4_SUB2_1 - LOAD2x4_2 - KERNEL2x4_E2 128,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x4_SAVE - KERNEL2x4 - - -ZGEMM_L2x4_SAVE: -/*----------------------------------------*/ - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 -#endif - - -ZGEMM_L2x4_END: -/*----------------------------------------*/ - - -ZGEMM_L2x2_BEGIN: -/*----------------------------------------*/ - andi. 
T1, M, 2 - ble ZGEMM_L2x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x2 - ble ZGEMM_L2x2_SUB0 - bl ZGEMM_2x2_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x2_SAVE - b ZGEMM_L2x2_SUB2 - - -ZGEMM_L2x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x2_32K - addi BO,BO,-32 - addi AO,AO,-32 - LOAD2x2O 32,32 - END2x2_WITHOUT_ADD - LOAD2x2_2O 64, 64 - mtctr T8 - bl ZGEMM_L2x2_K32 - b ZGEMM_L2x2_SAVE - CMP2x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L2x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-64 - LOAD2x2_2O 64,64 - bl ZGEMM_L2x2_K32 - b ZGEMM_L2x2_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L2x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L2x2_SUB2_8 - bl ZGEMM_2x2_L16_SUB - MY_ALIGN - - -ZGEMM_L2x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L2x2_SUB2_4 - bl ZGEMM_2x2_L8_SUB - MY_ALIGN - - -ZGEMM_L2x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x2_SUB2_2 - LOAD2x2_2 - KERNEL2x2_L2 64,64, 0,0 - KERNEL2x2_E2 64,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x2_SUB2_1 - LOAD2x2_2 - KERNEL2x2_E2 64,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x2_SAVE - KERNEL2x2 - - -ZGEMM_L2x2_SAVE: -/*----------------------------------------*/ - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 -#endif - - -ZGEMM_L2x2_END: -/*----------------------------------------*/ - - -ZGEMM_L2x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble ZGEMM_L2x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x1 - ble ZGEMM_L2x1_SUB0 - bl ZGEMM_2x1_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x1_SAVE - b ZGEMM_L2x1_SUB2 - - -ZGEMM_L2x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x1_32K - addi BO,BO,-32 - addi AO,AO,-16 - LOAD2x1O 16,32 - END2x1_WITHOUT_ADD - LOAD2x1_2O 32, 64 - mtctr T8 - bl ZGEMM_L2x1_K32 - b ZGEMM_L2x1_SAVE - CMP2x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L2x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-32 - LOAD2x1_2O 32,64 - bl ZGEMM_L2x1_K32 - b ZGEMM_L2x1_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L2x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L2x1_SUB2_8 - bl ZGEMM_2x1_L16_SUB - MY_ALIGN - - -ZGEMM_L2x1_SUB2_8: -/*----------------------------------------*/ - andi. 
T1,L, 8 - ble ZGEMM_L2x1_SUB2_4 - bl ZGEMM_2x1_L8_SUB - MY_ALIGN - - -ZGEMM_L2x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x1_SUB2_2 - LOAD2x1_2 - KERNEL2x1_L2 32,64, 0,0 - KERNEL2x1_E2 32,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x1_SUB2_1 - LOAD2x1_2 - KERNEL2x1_E2 32,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x1_SAVE - KERNEL2x1 - - -ZGEMM_L2x1_SAVE: -/*----------------------------------------*/ - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 -#endif - - -ZGEMM_L2x1_END: -/*----------------------------------------*/ - slwi T1, K, 5 - addic. J, J, -1 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif - bgt ZGEMM_L2_BEGIN - - -ZGEMM_L2_END: - -b ZGEMM_L1 -/* MINI SUBROUTINES */ -/* 1x8 MAIN 128x+2 LOOP */ - - -ZGEMM_L1x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x8_2 - MY_ALIGN -ZGEMM_L1x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 -ZGEMM_L1x8_K128: -/*----------------------------------------*/ - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - KERNEL1x8_L2 256,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 256,32,8,0 - KERNEL1x8_L2 256,32,9,0 - KERNEL1x8_L2 256,32,10,0 - KERNEL1x8_L2 256,32,11,0 - dcbt BO, T4 - KERNEL1x8_L2 256,32,12,0 - KERNEL1x8_L2 256,32,13,0 - KERNEL1x8_L2 256,32,14,0 - KERNEL1x8_L2 256,32,15,0 - KERNEL1x8_L2 256,32,16,0 - KERNEL1x8_L2 256,32,17,0 - KERNEL1x8_L2 256,32,18,0 - KERNEL1x8_L2 256,32,19,0 - KERNEL1x8_L2 256,32,20,0 - KERNEL1x8_L2 256,32,21,0 - KERNEL1x8_L2 256,32,22,0 - KERNEL1x8_L2 256,32,23,0 - KERNEL1x8_L2 256,32,24,0 - KERNEL1x8_L2 256,32,25,0 - KERNEL1x8_L2 256,32,26,0 - KERNEL1x8_L2 256,32,27,0 - KERNEL1x8_L2 256,32,28,0 - KERNEL1x8_L2 256,32,29,0 - KERNEL1x8_L2 256,32,30,0 - KERNEL1x8_L2 256,32,31,0 - KERNEL1x8_L2 256,32,32,0 - KERNEL1x8_L2 256,32,33,0 - KERNEL1x8_L2 256,32,34,0 - KERNEL1x8_L2 256,32,35,0 - KERNEL1x8_L2 256,32,36,0 - KERNEL1x8_L2 256,32,37,0 - KERNEL1x8_L2 256,32,38,0 - KERNEL1x8_L2 256,32,39,0 - KERNEL1x8_L2 256,32,40,0 - KERNEL1x8_L2 256,32,41,0 - KERNEL1x8_L2 256,32,42,0 - KERNEL1x8_L2 256,32,43,0 - KERNEL1x8_L2 256,32,44,0 - KERNEL1x8_L2 256,32,45,0 - KERNEL1x8_L2 256,32,46,0 - KERNEL1x8_L2 256,32,47,0 - KERNEL1x8_L2 256,32,48,0 - KERNEL1x8_L2 256,32,49,0 - KERNEL1x8_L2 256,32,50,0 - KERNEL1x8_L2 256,32,51,0 - KERNEL1x8_L2 256,32,52,0 - KERNEL1x8_L2 256,32,53,0 - KERNEL1x8_L2 256,32,54,0 - KERNEL1x8_L2 256,32,55,0 - KERNEL1x8_L2 256,32,56,0 - KERNEL1x8_L2 256,32,57,0 - KERNEL1x8_L2 256,32,58,0 - KERNEL1x8_L2 256,32,59,0 - KERNEL1x8_L2 256,32,60,0 - KERNEL1x8_L2 256,32,61,0 - KERNEL1x8_L2 256,32,62,0 - KERNEL1x8_L2 256,32,63,1 - bdnz ZGEMM_L1x8_LOOP - MY_ALIGN -ZGEMM_L1x8_LOOP_END: -/*----------------------------------------*/ - END1x8_2 - blr - MY_ALIGN - - -ZGEMM_1x8_L64_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - 
KERNEL1x8_L2 256,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 256,32,8,0 - KERNEL1x8_L2 256,32,9,0 - KERNEL1x8_L2 256,32,10,0 - KERNEL1x8_L2 256,32,11,0 - dcbt BO, T4 - KERNEL1x8_L2 256,32,12,0 - KERNEL1x8_L2 256,32,13,0 - KERNEL1x8_L2 256,32,14,0 - KERNEL1x8_L2 256,32,15,0 - KERNEL1x8_L2 256,32,16,0 - KERNEL1x8_L2 256,32,17,0 - KERNEL1x8_L2 256,32,18,0 - KERNEL1x8_L2 256,32,19,0 - KERNEL1x8_L2 256,32,20,0 - KERNEL1x8_L2 256,32,21,0 - KERNEL1x8_L2 256,32,22,0 - KERNEL1x8_L2 256,32,23,0 - KERNEL1x8_L2 256,32,24,0 - KERNEL1x8_L2 256,32,25,0 - KERNEL1x8_L2 256,32,26,0 - KERNEL1x8_L2 256,32,27,0 - KERNEL1x8_L2 256,32,28,0 - KERNEL1x8_L2 256,32,29,0 - KERNEL1x8_L2 256,32,30,0 - KERNEL1x8_E2 256,32,31,1 - blr - MY_ALIGN - - -ZGEMM_1x8_L32_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - KERNEL1x8_L2 256,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 256,32,8,0 - KERNEL1x8_L2 256,32,9,0 - KERNEL1x8_L2 256,32,10,0 - KERNEL1x8_L2 256,32,11,0 - dcbt BO, T4 - KERNEL1x8_L2 256,32,12,0 - KERNEL1x8_L2 256,32,13,0 - KERNEL1x8_L2 256,32,14,0 - KERNEL1x8_E2 256,32,15,1 - blr - MY_ALIGN - - -ZGEMM_1x8_L16_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - KERNEL1x8_E2 256,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x4_2 - MY_ALIGN - - -ZGEMM_L1x4_LOOP: -/*----------------------------------------*/ - KERNEL1x4_L2 128,32,0,0 - - -ZGEMM_L1x4_K32: -/*----------------------------------------*/ - KERNEL1x4_L2 128,32,1,0 - KERNEL1x4_L2 128,32,2,0 - KERNEL1x4_L2 128,32,3,0 - KERNEL1x4_L2 128,32,4,0 - KERNEL1x4_L2 128,32,5,0 - KERNEL1x4_L2 128,32,6,0 - KERNEL1x4_L2 128,32,7,0 - KERNEL1x4_L2 128,32,8,0 - KERNEL1x4_L2 128,32,9,0 - KERNEL1x4_L2 128,32,10,0 - KERNEL1x4_L2 128,32,11,0 - KERNEL1x4_L2 128,32,12,0 - KERNEL1x4_L2 128,32,13,0 - KERNEL1x4_L2 128,32,14,0 - KERNEL1x4_L2 128,32,15,1 - bdnz ZGEMM_L1x4_LOOP - MY_ALIGN - - -ZGEMM_L1x4_LOOP_END: -/*----------------------------------------*/ - END1x4_2 - blr - MY_ALIGN - - -ZGEMM_1x4_L16_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 128,32,0,0 - KERNEL1x4_L2 128,32,1,0 - KERNEL1x4_L2 128,32,2,0 - KERNEL1x4_L2 128,32,3,0 - KERNEL1x4_L2 128,32,4,0 - KERNEL1x4_L2 128,32,5,0 - KERNEL1x4_L2 128,32,6,0 - KERNEL1x4_E2 128,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x4_L8_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 128,32,0,0 - KERNEL1x4_L2 128,32,1,0 - KERNEL1x4_L2 128,32,2,0 - KERNEL1x4_E2 128,32,3,1 - blr - - -ZGEMM_1x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x2_2 - MY_ALIGN - - -ZGEMM_L1x2_LOOP: -/*----------------------------------------*/ - KERNEL1x2_L2 64,32,0,0 - - -ZGEMM_L1x2_K32: -/*----------------------------------------*/ - KERNEL1x2_L2 64,32,1,0 - KERNEL1x2_L2 64,32,2,0 - KERNEL1x2_L2 64,32,3,0 - KERNEL1x2_L2 64,32,4,0 - KERNEL1x2_L2 64,32,5,0 - KERNEL1x2_L2 64,32,6,0 - KERNEL1x2_L2 64,32,7,0 - KERNEL1x2_L2 64,32,8,0 - KERNEL1x2_L2 
64,32,9,0 - KERNEL1x2_L2 64,32,10,0 - KERNEL1x2_L2 64,32,11,0 - KERNEL1x2_L2 64,32,12,0 - KERNEL1x2_L2 64,32,13,0 - KERNEL1x2_L2 64,32,14,0 - KERNEL1x2_L2 64,32,15,1 - bdnz ZGEMM_L1x2_LOOP - MY_ALIGN - - -ZGEMM_L1x2_LOOP_END: -/*----------------------------------------*/ - END1x2_2 - blr - MY_ALIGN - - -ZGEMM_1x2_L16_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 64,32,0,0 - KERNEL1x2_L2 64,32,1,0 - KERNEL1x2_L2 64,32,2,0 - KERNEL1x2_L2 64,32,3,0 - KERNEL1x2_L2 64,32,4,0 - KERNEL1x2_L2 64,32,5,0 - KERNEL1x2_L2 64,32,6,0 - KERNEL1x2_E2 64,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x2_L8_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 64,32,0,0 - KERNEL1x2_L2 64,32,1,0 - KERNEL1x2_L2 64,32,2,0 - KERNEL1x2_E2 64,32,3,1 - blr - - -ZGEMM_1x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x1_2 - MY_ALIGN - - -ZGEMM_L1x1_LOOP: -/*----------------------------------------*/ - KERNEL1x1_L2 32,32,0,0 - - -ZGEMM_L1x1_K32: -/*----------------------------------------*/ - KERNEL1x1_L2 32,32,1,0 - KERNEL1x1_L2 32,32,2,0 - KERNEL1x1_L2 32,32,3,0 - KERNEL1x1_L2 32,32,4,0 - KERNEL1x1_L2 32,32,5,0 - KERNEL1x1_L2 32,32,6,0 - KERNEL1x1_L2 32,32,7,0 - KERNEL1x1_L2 32,32,8,0 - KERNEL1x1_L2 32,32,9,0 - KERNEL1x1_L2 32,32,10,0 - KERNEL1x1_L2 32,32,11,0 - KERNEL1x1_L2 32,32,12,0 - KERNEL1x1_L2 32,32,13,0 - KERNEL1x1_L2 32,32,14,0 - KERNEL1x1_L2 32,32,15,1 - bdnz ZGEMM_L1x1_LOOP - MY_ALIGN - - -ZGEMM_L1x1_LOOP_END: -/*----------------------------------------*/ - END1x1_2 - blr - MY_ALIGN - - -ZGEMM_1x1_L16_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 32,32,0,0 - KERNEL1x1_L2 32,32,1,0 - KERNEL1x1_L2 32,32,2,0 - KERNEL1x1_L2 32,32,3,0 - KERNEL1x1_L2 32,32,4,0 - KERNEL1x1_L2 32,32,5,0 - KERNEL1x1_L2 32,32,6,0 - KERNEL1x1_E2 32,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x1_L8_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 32,32,0,0 - KERNEL1x1_L2 32,32,1,0 - KERNEL1x1_L2 32,32,2,0 - KERNEL1x1_E2 32,32,3,1 - blr - - -/*----------------------N1 BEGINS---------*/ -ZGEMM_L1: -/*----------------------------------------*/ - andi. T1, N, 1 - ble ZGEMM_L1_END - -ZGEMM_L1_BEGIN: -/*----------------------------------------*/ - mr CO, C - - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble ZGEMM_L1x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -ZGEMM_L1x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T11-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO1x8 - ble ZGEMM_L1x8_SUB0 - bl ZGEMM_L1x8_LMAIN_SUB - andi. L, T1, 127 - ble ZGEMM_L1x8_SAVE - b ZGEMM_L1x8_SUB2 - - -ZGEMM_L1x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. 
L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP1x8_128K - addi BO,BO,-16 - addi AO,AO,-128 - LOAD1x8O 128,16 - END1x8_WITHOUT_ADD - LOAD1x8_2O 256, 32 - mtctr T8 - bl ZGEMM_L1x8_K128 - b ZGEMM_L1x8_SAVE - CMP1x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne ZGEMM_L1x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-256 - LOAD1x8_2O 256,32 - bl ZGEMM_L1x8_K128 - b ZGEMM_L1x8_SAVE - MY_ALIGN - - -ZGEMM_L1x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble ZGEMM_L1x8_SUB2_32 - bl ZGEMM_1x8_L64_SUB - MY_ALIGN - - -ZGEMM_L1x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble ZGEMM_L1x8_SUB2_16 - bl ZGEMM_1x8_L32_SUB - MY_ALIGN - - -ZGEMM_L1x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x8_SUB2_8 - bl ZGEMM_1x8_L16_SUB - MY_ALIGN - - -ZGEMM_L1x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L1x8_SUB2_4 - LOAD1x8_2 - KERNEL1x8_L2 256,32, 0,0 - KERNEL1x8_L2 256,32, 1,0 - KERNEL1x8_L2 256,32, 2,0 - KERNEL1x8_E2 256,32, 3,1 - MY_ALIGN - - -ZGEMM_L1x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x8_SUB2_2 - LOAD1x8_2 - KERNEL1x8_L2 256,32, 0,0 - KERNEL1x8_E2 256,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x8_SUB2_1 - LOAD1x8_2 - KERNEL1x8_E2 256,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x8_SAVE - KERNEL1x8 - - -ZGEMM_L1x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 -#endif - bgt ZGEMM_L1x8_BEGIN - andi. T2, M, 7 - ble ZGEMM_L1x1_END - andi. T1, M, 4 - ble ZGEMM_L1x4_END - b ZGEMM_L1x4_BEGIN - MY_ALIGN - - -ZGEMM_L1x8_END: -/*----------------------------------------*/ - - -ZGEMM_L1x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble ZGEMM_L1x1_END - andi. T1, M, 4 - ble ZGEMM_L1x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO1x4 - ble ZGEMM_L1x4_SUB0 - bl ZGEMM_1x4_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L1x4_SAVE - b ZGEMM_L1x4_SUB2 - - -ZGEMM_L1x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x4_32K - addi BO,BO,-16 - addi AO,AO,-64 - LOAD1x4O 64,16 - END1x4_WITHOUT_ADD - LOAD1x4_2O 128, 32 - mtctr T8 - bl ZGEMM_L1x4_K32 - b ZGEMM_L1x4_SAVE - CMP1x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L1x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-128 - LOAD1x4_2O 128,32 - bl ZGEMM_L1x4_K32 - b ZGEMM_L1x4_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L1x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x4_SUB2_8 - bl ZGEMM_1x4_L16_SUB - MY_ALIGN - - -ZGEMM_L1x4_SUB2_8: -/*----------------------------------------*/ - andi. 
T1,L, 8 - ble ZGEMM_L1x4_SUB2_4 - bl ZGEMM_1x4_L8_SUB - MY_ALIGN - - -ZGEMM_L1x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x4_SUB2_2 - LOAD1x4_2 - KERNEL1x4_L2 128,32, 0,0 - KERNEL1x4_E2 128,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x4_SUB2_1 - LOAD1x4_2 - KERNEL1x4_E2 128,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x4_SAVE - KERNEL1x4 - - -ZGEMM_L1x4_SAVE: -/*----------------------------------------*/ - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 -#endif - - -ZGEMM_L1x4_END: -/*----------------------------------------*/ - - -ZGEMM_L1x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble ZGEMM_L1x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO1x2 - ble ZGEMM_L1x2_SUB0 - bl ZGEMM_1x2_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L1x2_SAVE - b ZGEMM_L1x2_SUB2 - - -ZGEMM_L1x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x2_32K - addi BO,BO,-16 - addi AO,AO,-32 - LOAD1x2O 32,16 - END1x2_WITHOUT_ADD - LOAD1x2_2O 64, 32 - mtctr T8 - bl ZGEMM_L1x2_K32 - b ZGEMM_L1x2_SAVE - CMP1x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L1x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-64 - LOAD1x2_2O 64,32 - bl ZGEMM_L1x2_K32 - b ZGEMM_L1x2_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L1x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x2_SUB2_8 - bl ZGEMM_1x2_L16_SUB - MY_ALIGN - - -ZGEMM_L1x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L1x2_SUB2_4 - bl ZGEMM_1x2_L8_SUB - MY_ALIGN - - -ZGEMM_L1x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x2_SUB2_2 - LOAD1x2_2 - KERNEL1x2_L2 64,32, 0,0 - KERNEL1x2_E2 64,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x2_SUB2_1 - LOAD1x2_2 - KERNEL1x2_E2 64,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x2_SAVE - KERNEL1x2 - - -ZGEMM_L1x2_SAVE: -/*----------------------------------------*/ - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 -#endif - - -ZGEMM_L1x2_END: -/*----------------------------------------*/ - - -ZGEMM_L1x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble ZGEMM_L1x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO1x1 - ble ZGEMM_L1x1_SUB0 - bl ZGEMM_1x1_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L1x1_SAVE - b ZGEMM_L1x1_SUB2 - - -ZGEMM_L1x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. 
L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x1_32K - addi BO,BO,-16 - addi AO,AO,-16 - LOAD1x1O 16,16 - END1x1_WITHOUT_ADD - LOAD1x1_2O 32, 32 - mtctr T8 - bl ZGEMM_L1x1_K32 - b ZGEMM_L1x1_SAVE - CMP1x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L1x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-32 - LOAD1x1_2O 32,32 - bl ZGEMM_L1x1_K32 - b ZGEMM_L1x1_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L1x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x1_SUB2_8 - bl ZGEMM_1x1_L16_SUB - MY_ALIGN - - -ZGEMM_L1x1_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L1x1_SUB2_4 - bl ZGEMM_1x1_L8_SUB - MY_ALIGN - - -ZGEMM_L1x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x1_SUB2_2 - LOAD1x1_2 - KERNEL1x1_L2 32,32, 0,0 - KERNEL1x1_E2 32,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x1_SUB2_1 - LOAD1x1_2 - KERNEL1x1_E2 32,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x1_SAVE - KERNEL1x1 - - -ZGEMM_L1x1_SAVE: -/*----------------------------------------*/ - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 -#endif - - -ZGEMM_L1x1_END: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif - - -ZGEMM_L1_END: -/*----------------------------------------*/ +/*************************************************************************** +Copyright (c) 2013-2019, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define MY_ALIGN .align 3 +b ZGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN +ZGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 +ZGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_L2 256,64,15,0 + KERNEL2x8_L2 256,64,16,0 + KERNEL2x8_L2 256,64,17,0 + KERNEL2x8_L2 256,64,18,0 + KERNEL2x8_L2 256,64,19,0 + KERNEL2x8_L2 256,64,20,0 + KERNEL2x8_L2 256,64,21,0 + KERNEL2x8_L2 256,64,22,0 + KERNEL2x8_L2 256,64,23,0 + KERNEL2x8_L2 256,64,24,0 + KERNEL2x8_L2 256,64,25,0 + KERNEL2x8_L2 256,64,26,0 + KERNEL2x8_L2 256,64,27,0 + KERNEL2x8_L2 256,64,28,0 + KERNEL2x8_L2 256,64,29,0 + KERNEL2x8_L2 256,64,30,0 + KERNEL2x8_L2 256,64,31,0 + KERNEL2x8_L2 256,64,32,0 + KERNEL2x8_L2 256,64,33,0 + KERNEL2x8_L2 256,64,34,0 + KERNEL2x8_L2 256,64,35,0 + KERNEL2x8_L2 256,64,36,0 + KERNEL2x8_L2 256,64,37,0 + KERNEL2x8_L2 256,64,38,0 + KERNEL2x8_L2 256,64,39,0 + KERNEL2x8_L2 256,64,40,0 + KERNEL2x8_L2 256,64,41,0 + KERNEL2x8_L2 256,64,42,0 + KERNEL2x8_L2 256,64,43,0 + KERNEL2x8_L2 256,64,44,0 + KERNEL2x8_L2 256,64,45,0 + KERNEL2x8_L2 256,64,46,0 + KERNEL2x8_L2 256,64,47,0 + KERNEL2x8_L2 256,64,48,0 + KERNEL2x8_L2 256,64,49,0 + KERNEL2x8_L2 256,64,50,0 + KERNEL2x8_L2 256,64,51,0 + KERNEL2x8_L2 256,64,52,0 + KERNEL2x8_L2 256,64,53,0 + KERNEL2x8_L2 256,64,54,0 + KERNEL2x8_L2 256,64,55,0 + KERNEL2x8_L2 256,64,56,0 + KERNEL2x8_L2 256,64,57,0 + KERNEL2x8_L2 256,64,58,0 + KERNEL2x8_L2 256,64,59,0 + KERNEL2x8_L2 256,64,60,0 + KERNEL2x8_L2 256,64,61,0 + KERNEL2x8_L2 256,64,62,0 + KERNEL2x8_L2 256,64,63,1 + bdnz ZGEMM_L2x8_LOOP + MY_ALIGN +ZGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + END2x8_2 + blr + MY_ALIGN + + +ZGEMM_2x8_L64_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_L2 256,64,15,0 + KERNEL2x8_L2 256,64,16,0 + KERNEL2x8_L2 256,64,17,0 + KERNEL2x8_L2 256,64,18,0 + KERNEL2x8_L2 256,64,19,0 + KERNEL2x8_L2 256,64,20,0 + KERNEL2x8_L2 256,64,21,0 + KERNEL2x8_L2 256,64,22,0 + KERNEL2x8_L2 256,64,23,0 + KERNEL2x8_L2 256,64,24,0 + KERNEL2x8_L2 256,64,25,0 + KERNEL2x8_L2 256,64,26,0 + KERNEL2x8_L2 256,64,27,0 + KERNEL2x8_L2 256,64,28,0 + KERNEL2x8_L2 256,64,29,0 + KERNEL2x8_L2 256,64,30,0 + KERNEL2x8_E2 256,64,31,1 + blr + MY_ALIGN + + +ZGEMM_2x8_L32_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt 
AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_L2 256,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 256,64,8,0 + KERNEL2x8_L2 256,64,9,0 + KERNEL2x8_L2 256,64,10,0 + KERNEL2x8_L2 256,64,11,0 + dcbt BO, T4 + KERNEL2x8_L2 256,64,12,0 + KERNEL2x8_L2 256,64,13,0 + KERNEL2x8_L2 256,64,14,0 + KERNEL2x8_E2 256,64,15,1 + blr + MY_ALIGN + + +ZGEMM_2x8_L16_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 256,64,0,0 + KERNEL2x8_L2 256,64,1,0 + dcbt AO, T2 + KERNEL2x8_L2 256,64,2,0 + KERNEL2x8_L2 256,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 256,64,4,0 + KERNEL2x8_L2 256,64,5,0 + dcbt AO, T4 + KERNEL2x8_L2 256,64,6,0 + KERNEL2x8_E2 256,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +ZGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 128,64,0,0 +ZGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_L2 128,64,3,0 + KERNEL2x4_L2 128,64,4,0 + KERNEL2x4_L2 128,64,5,0 + KERNEL2x4_L2 128,64,6,0 + KERNEL2x4_L2 128,64,7,0 + KERNEL2x4_L2 128,64,8,0 + KERNEL2x4_L2 128,64,9,0 + KERNEL2x4_L2 128,64,10,0 + KERNEL2x4_L2 128,64,11,0 + KERNEL2x4_L2 128,64,12,0 + KERNEL2x4_L2 128,64,13,0 + KERNEL2x4_L2 128,64,14,0 + KERNEL2x4_L2 128,64,15,1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN +ZGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + END2x4_2 + blr + MY_ALIGN + + +ZGEMM_2x4_L16_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 128,64,0,0 + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_L2 128,64,3,0 + KERNEL2x4_L2 128,64,4,0 + KERNEL2x4_L2 128,64,5,0 + KERNEL2x4_L2 128,64,6,0 + KERNEL2x4_E2 128,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x4_L8_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 128,64,0,0 + KERNEL2x4_L2 128,64,1,0 + KERNEL2x4_L2 128,64,2,0 + KERNEL2x4_E2 128,64,3,1 + blr + + +ZGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +ZGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 64,64,0,0 +ZGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_L2 64,64,3,0 + KERNEL2x2_L2 64,64,4,0 + KERNEL2x2_L2 64,64,5,0 + KERNEL2x2_L2 64,64,6,0 + KERNEL2x2_L2 64,64,7,0 + KERNEL2x2_L2 64,64,8,0 + KERNEL2x2_L2 64,64,9,0 + KERNEL2x2_L2 64,64,10,0 + KERNEL2x2_L2 64,64,11,0 + KERNEL2x2_L2 64,64,12,0 + KERNEL2x2_L2 64,64,13,0 + KERNEL2x2_L2 64,64,14,0 + KERNEL2x2_L2 64,64,15,1 + bdnz ZGEMM_L2x2_LOOP + MY_ALIGN + + +ZGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +ZGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 64,64,0,0 + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_L2 64,64,3,0 + KERNEL2x2_L2 64,64,4,0 + KERNEL2x2_L2 64,64,5,0 + KERNEL2x2_L2 64,64,6,0 + KERNEL2x2_E2 64,64,7,1 + blr + MY_ALIGN +ZGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 64,64,0,0 + KERNEL2x2_L2 64,64,1,0 + KERNEL2x2_L2 64,64,2,0 + KERNEL2x2_E2 64,64,3,1 + blr + + +ZGEMM_2x1_LMAIN_SUB: 
+/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +ZGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 32,64,0,0 +ZGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_L2 32,64,3,0 + KERNEL2x1_L2 32,64,4,0 + KERNEL2x1_L2 32,64,5,0 + KERNEL2x1_L2 32,64,6,0 + KERNEL2x1_L2 32,64,7,0 + KERNEL2x1_L2 32,64,8,0 + KERNEL2x1_L2 32,64,9,0 + KERNEL2x1_L2 32,64,10,0 + KERNEL2x1_L2 32,64,11,0 + KERNEL2x1_L2 32,64,12,0 + KERNEL2x1_L2 32,64,13,0 + KERNEL2x1_L2 32,64,14,0 + KERNEL2x1_L2 32,64,15,1 + bdnz ZGEMM_L2x1_LOOP + MY_ALIGN +ZGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN +ZGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 32,64,0,0 + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_L2 32,64,3,0 + KERNEL2x1_L2 32,64,4,0 + KERNEL2x1_L2 32,64,5,0 + KERNEL2x1_L2 32,64,6,0 + KERNEL2x1_E2 32,64,7,1 + blr + MY_ALIGN + + +ZGEMM_2x1_L8_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 32,64,0,0 + KERNEL2x1_L2 32,64,1,0 + KERNEL2x1_L2 32,64,2,0 + KERNEL2x1_E2 32,64,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +ZGEMM_L2: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 1 + ble ZGEMM_L2_END + + +ZGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L2x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T11-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO2x8 + ble ZGEMM_L2x8_SUB0 + bl ZGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L2x8_SAVE + b ZGEMM_L2x8_SUB2 + + +ZGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8O 128,32 + END2x8_WITHOUT_ADD + LOAD2x8_2O 256, 64 + mtctr T8 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne ZGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-256 + LOAD2x8_2O 256,64 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + MY_ALIGN + + +ZGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L2x8_SUB2_32 + bl ZGEMM_2x8_L64_SUB + MY_ALIGN + + +ZGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble ZGEMM_L2x8_SUB2_16 + bl ZGEMM_2x8_L32_SUB + MY_ALIGN + + +ZGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. 
T1,L, 16 + ble ZGEMM_L2x8_SUB2_8 + bl ZGEMM_2x8_L16_SUB + MY_ALIGN + + +ZGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x8_SUB2_4 + LOAD2x8_2 + KERNEL2x8_L2 256,64, 0,0 + KERNEL2x8_L2 256,64, 1,0 + KERNEL2x8_L2 256,64, 2,0 + KERNEL2x8_E2 256,64, 3,1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x8_SUB2_2 + LOAD2x8_2 + KERNEL2x8_L2 256,64, 0,0 + KERNEL2x8_E2 256,64, 1,1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x8_SUB2_1 + LOAD2x8_2 + KERNEL2x8_E2 256,64, 0,1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x8_SAVE + KERNEL2x8 + + +ZGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt ZGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END + b ZGEMM_L2x4_BEGIN + MY_ALIGN + + +ZGEMM_L2x8_END: +/*----------------------------------------*/ + + +ZGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x4 + ble ZGEMM_L2x4_SUB0 + bl ZGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 + + +ZGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4O 64,32 + END2x4_WITHOUT_ADD + LOAD2x4_2O 128, 64 + mtctr T8 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD2x4_2O 128,64 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x4_SUB2_8 + bl ZGEMM_2x4_L16_SUB + MY_ALIGN + + +ZGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x4_SUB2_4 + bl ZGEMM_2x4_L8_SUB + MY_ALIGN + + +ZGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x4_SUB2_2 + LOAD2x4_2 + KERNEL2x4_L2 128,64, 0,0 + KERNEL2x4_E2 128,64, 1,1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x4_SUB2_1 + LOAD2x4_2 + KERNEL2x4_E2 128,64, 0,1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x4_SAVE + KERNEL2x4 + + +ZGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif + + +ZGEMM_L2x4_END: +/*----------------------------------------*/ + + +ZGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. 
T1, M, 2 + ble ZGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x2 + ble ZGEMM_L2x2_SUB0 + bl ZGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x2_SAVE + b ZGEMM_L2x2_SUB2 + + +ZGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2O 32,32 + END2x2_WITHOUT_ADD + LOAD2x2_2O 64, 64 + mtctr T8 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD2x2_2O 64,64 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x2_SUB2_8 + bl ZGEMM_2x2_L16_SUB + MY_ALIGN + + +ZGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x2_SUB2_4 + bl ZGEMM_2x2_L8_SUB + MY_ALIGN + + +ZGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x2_SUB2_2 + LOAD2x2_2 + KERNEL2x2_L2 64,64, 0,0 + KERNEL2x2_E2 64,64, 1,1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x2_SUB2_1 + LOAD2x2_2 + KERNEL2x2_E2 64,64, 0,1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x2_SAVE + KERNEL2x2 + + +ZGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif + + +ZGEMM_L2x2_END: +/*----------------------------------------*/ + + +ZGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble ZGEMM_L2x1_SUB0 + bl ZGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x1_SAVE + b ZGEMM_L2x1_SUB2 + + +ZGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1O 16,32 + END2x1_WITHOUT_ADD + LOAD2x1_2O 32, 64 + mtctr T8 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD2x1_2O 32,64 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x1_SUB2_8 + bl ZGEMM_2x1_L16_SUB + MY_ALIGN + + +ZGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble ZGEMM_L2x1_SUB2_4 + bl ZGEMM_2x1_L8_SUB + MY_ALIGN + + +ZGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 32,64, 0,0 + KERNEL2x1_E2 32,64, 1,1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 32,64, 0,1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x1_SAVE + KERNEL2x1 + + +ZGEMM_L2x1_SAVE: +/*----------------------------------------*/ + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif + + +ZGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + bgt ZGEMM_L2_BEGIN + + +ZGEMM_L2_END: + +b ZGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +ZGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 +ZGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_L2 256,32,15,0 + KERNEL1x8_L2 256,32,16,0 + KERNEL1x8_L2 256,32,17,0 + KERNEL1x8_L2 256,32,18,0 + KERNEL1x8_L2 256,32,19,0 + KERNEL1x8_L2 256,32,20,0 + KERNEL1x8_L2 256,32,21,0 + KERNEL1x8_L2 256,32,22,0 + KERNEL1x8_L2 256,32,23,0 + KERNEL1x8_L2 256,32,24,0 + KERNEL1x8_L2 256,32,25,0 + KERNEL1x8_L2 256,32,26,0 + KERNEL1x8_L2 256,32,27,0 + KERNEL1x8_L2 256,32,28,0 + KERNEL1x8_L2 256,32,29,0 + KERNEL1x8_L2 256,32,30,0 + KERNEL1x8_L2 256,32,31,0 + KERNEL1x8_L2 256,32,32,0 + KERNEL1x8_L2 256,32,33,0 + KERNEL1x8_L2 256,32,34,0 + KERNEL1x8_L2 256,32,35,0 + KERNEL1x8_L2 256,32,36,0 + KERNEL1x8_L2 256,32,37,0 + KERNEL1x8_L2 256,32,38,0 + KERNEL1x8_L2 256,32,39,0 + KERNEL1x8_L2 256,32,40,0 + KERNEL1x8_L2 256,32,41,0 + KERNEL1x8_L2 256,32,42,0 + KERNEL1x8_L2 256,32,43,0 + KERNEL1x8_L2 256,32,44,0 + KERNEL1x8_L2 256,32,45,0 + KERNEL1x8_L2 256,32,46,0 + KERNEL1x8_L2 256,32,47,0 + KERNEL1x8_L2 256,32,48,0 + KERNEL1x8_L2 256,32,49,0 + KERNEL1x8_L2 256,32,50,0 + KERNEL1x8_L2 256,32,51,0 + KERNEL1x8_L2 256,32,52,0 + KERNEL1x8_L2 256,32,53,0 + KERNEL1x8_L2 256,32,54,0 + KERNEL1x8_L2 256,32,55,0 + KERNEL1x8_L2 256,32,56,0 + KERNEL1x8_L2 256,32,57,0 + KERNEL1x8_L2 256,32,58,0 + KERNEL1x8_L2 256,32,59,0 + KERNEL1x8_L2 256,32,60,0 + KERNEL1x8_L2 256,32,61,0 + KERNEL1x8_L2 256,32,62,0 + KERNEL1x8_L2 256,32,63,1 + bdnz ZGEMM_L1x8_LOOP + MY_ALIGN +ZGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + + +ZGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + 
KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_L2 256,32,15,0 + KERNEL1x8_L2 256,32,16,0 + KERNEL1x8_L2 256,32,17,0 + KERNEL1x8_L2 256,32,18,0 + KERNEL1x8_L2 256,32,19,0 + KERNEL1x8_L2 256,32,20,0 + KERNEL1x8_L2 256,32,21,0 + KERNEL1x8_L2 256,32,22,0 + KERNEL1x8_L2 256,32,23,0 + KERNEL1x8_L2 256,32,24,0 + KERNEL1x8_L2 256,32,25,0 + KERNEL1x8_L2 256,32,26,0 + KERNEL1x8_L2 256,32,27,0 + KERNEL1x8_L2 256,32,28,0 + KERNEL1x8_L2 256,32,29,0 + KERNEL1x8_L2 256,32,30,0 + KERNEL1x8_E2 256,32,31,1 + blr + MY_ALIGN + + +ZGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_L2 256,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 256,32,8,0 + KERNEL1x8_L2 256,32,9,0 + KERNEL1x8_L2 256,32,10,0 + KERNEL1x8_L2 256,32,11,0 + dcbt BO, T4 + KERNEL1x8_L2 256,32,12,0 + KERNEL1x8_L2 256,32,13,0 + KERNEL1x8_L2 256,32,14,0 + KERNEL1x8_E2 256,32,15,1 + blr + MY_ALIGN + + +ZGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 256,32,0,0 + KERNEL1x8_L2 256,32,1,0 + dcbt AO, T2 + KERNEL1x8_L2 256,32,2,0 + KERNEL1x8_L2 256,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 256,32,4,0 + KERNEL1x8_L2 256,32,5,0 + dcbt AO, T4 + KERNEL1x8_L2 256,32,6,0 + KERNEL1x8_E2 256,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN + + +ZGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 128,32,0,0 + + +ZGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_L2 128,32,3,0 + KERNEL1x4_L2 128,32,4,0 + KERNEL1x4_L2 128,32,5,0 + KERNEL1x4_L2 128,32,6,0 + KERNEL1x4_L2 128,32,7,0 + KERNEL1x4_L2 128,32,8,0 + KERNEL1x4_L2 128,32,9,0 + KERNEL1x4_L2 128,32,10,0 + KERNEL1x4_L2 128,32,11,0 + KERNEL1x4_L2 128,32,12,0 + KERNEL1x4_L2 128,32,13,0 + KERNEL1x4_L2 128,32,14,0 + KERNEL1x4_L2 128,32,15,1 + bdnz ZGEMM_L1x4_LOOP + MY_ALIGN + + +ZGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +ZGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 128,32,0,0 + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_L2 128,32,3,0 + KERNEL1x4_L2 128,32,4,0 + KERNEL1x4_L2 128,32,5,0 + KERNEL1x4_L2 128,32,6,0 + KERNEL1x4_E2 128,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 128,32,0,0 + KERNEL1x4_L2 128,32,1,0 + KERNEL1x4_L2 128,32,2,0 + KERNEL1x4_E2 128,32,3,1 + blr + + +ZGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN + + +ZGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 64,32,0,0 + + +ZGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_L2 64,32,3,0 + KERNEL1x2_L2 64,32,4,0 + KERNEL1x2_L2 64,32,5,0 + KERNEL1x2_L2 64,32,6,0 + KERNEL1x2_L2 64,32,7,0 + KERNEL1x2_L2 64,32,8,0 + KERNEL1x2_L2 
64,32,9,0 + KERNEL1x2_L2 64,32,10,0 + KERNEL1x2_L2 64,32,11,0 + KERNEL1x2_L2 64,32,12,0 + KERNEL1x2_L2 64,32,13,0 + KERNEL1x2_L2 64,32,14,0 + KERNEL1x2_L2 64,32,15,1 + bdnz ZGEMM_L1x2_LOOP + MY_ALIGN + + +ZGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN + + +ZGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 64,32,0,0 + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_L2 64,32,3,0 + KERNEL1x2_L2 64,32,4,0 + KERNEL1x2_L2 64,32,5,0 + KERNEL1x2_L2 64,32,6,0 + KERNEL1x2_E2 64,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x2_L8_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 64,32,0,0 + KERNEL1x2_L2 64,32,1,0 + KERNEL1x2_L2 64,32,2,0 + KERNEL1x2_E2 64,32,3,1 + blr + + +ZGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN + + +ZGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 32,32,0,0 + + +ZGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_L2 32,32,3,0 + KERNEL1x1_L2 32,32,4,0 + KERNEL1x1_L2 32,32,5,0 + KERNEL1x1_L2 32,32,6,0 + KERNEL1x1_L2 32,32,7,0 + KERNEL1x1_L2 32,32,8,0 + KERNEL1x1_L2 32,32,9,0 + KERNEL1x1_L2 32,32,10,0 + KERNEL1x1_L2 32,32,11,0 + KERNEL1x1_L2 32,32,12,0 + KERNEL1x1_L2 32,32,13,0 + KERNEL1x1_L2 32,32,14,0 + KERNEL1x1_L2 32,32,15,1 + bdnz ZGEMM_L1x1_LOOP + MY_ALIGN + + +ZGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + MY_ALIGN + + +ZGEMM_1x1_L16_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 32,32,0,0 + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_L2 32,32,3,0 + KERNEL1x1_L2 32,32,4,0 + KERNEL1x1_L2 32,32,5,0 + KERNEL1x1_L2 32,32,6,0 + KERNEL1x1_E2 32,32,7,1 + blr + MY_ALIGN + + +ZGEMM_1x1_L8_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 32,32,0,0 + KERNEL1x1_L2 32,32,1,0 + KERNEL1x1_L2 32,32,2,0 + KERNEL1x1_E2 32,32,3,1 + blr + + +/*----------------------N1 BEGINS---------*/ +ZGEMM_L1: +/*----------------------------------------*/ + andi. T1, N, 1 + ble ZGEMM_L1_END + +ZGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T11-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO1x8 + ble ZGEMM_L1x8_SUB0 + bl ZGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L1x8_SAVE + b ZGEMM_L1x8_SUB2 + + +ZGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. 
L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8O 128,16 + END1x8_WITHOUT_ADD + LOAD1x8_2O 256, 32 + mtctr T8 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne ZGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-256 + LOAD1x8_2O 256,32 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + MY_ALIGN + + +ZGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L1x8_SUB2_32 + bl ZGEMM_1x8_L64_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble ZGEMM_L1x8_SUB2_16 + bl ZGEMM_1x8_L32_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x8_SUB2_8 + bl ZGEMM_1x8_L16_SUB + MY_ALIGN + + +ZGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x8_SUB2_4 + LOAD1x8_2 + KERNEL1x8_L2 256,32, 0,0 + KERNEL1x8_L2 256,32, 1,0 + KERNEL1x8_L2 256,32, 2,0 + KERNEL1x8_E2 256,32, 3,1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x8_SUB2_2 + LOAD1x8_2 + KERNEL1x8_L2 256,32, 0,0 + KERNEL1x8_E2 256,32, 1,1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x8_SUB2_1 + LOAD1x8_2 + KERNEL1x8_E2 256,32, 0,1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x8_SAVE + KERNEL1x8 + + +ZGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt ZGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END + b ZGEMM_L1x4_BEGIN + MY_ALIGN + + +ZGEMM_L1x8_END: +/*----------------------------------------*/ + + +ZGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x4 + ble ZGEMM_L1x4_SUB0 + bl ZGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x4_SAVE + b ZGEMM_L1x4_SUB2 + + +ZGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4O 64,16 + END1x4_WITHOUT_ADD + LOAD1x4_2O 128, 32 + mtctr T8 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD1x4_2O 128,32 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x4_SUB2_8 + bl ZGEMM_1x4_L16_SUB + MY_ALIGN + + +ZGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble ZGEMM_L1x4_SUB2_4 + bl ZGEMM_1x4_L8_SUB + MY_ALIGN + + +ZGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x4_SUB2_2 + LOAD1x4_2 + KERNEL1x4_L2 128,32, 0,0 + KERNEL1x4_E2 128,32, 1,1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x4_SUB2_1 + LOAD1x4_2 + KERNEL1x4_E2 128,32, 0,1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x4_SAVE + KERNEL1x4 + + +ZGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif + + +ZGEMM_L1x4_END: +/*----------------------------------------*/ + + +ZGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x2 + ble ZGEMM_L1x2_SUB0 + bl ZGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x2_SAVE + b ZGEMM_L1x2_SUB2 + + +ZGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2O 32,16 + END1x2_WITHOUT_ADD + LOAD1x2_2O 64, 32 + mtctr T8 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD1x2_2O 64,32 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x2_SUB2_8 + bl ZGEMM_1x2_L16_SUB + MY_ALIGN + + +ZGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x2_SUB2_4 + bl ZGEMM_1x2_L8_SUB + MY_ALIGN + + +ZGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x2_SUB2_2 + LOAD1x2_2 + KERNEL1x2_L2 64,32, 0,0 + KERNEL1x2_E2 64,32, 1,1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x2_SUB2_1 + LOAD1x2_2 + KERNEL1x2_E2 64,32, 0,1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x2_SAVE + KERNEL1x2 + + +ZGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif + + +ZGEMM_L1x2_END: +/*----------------------------------------*/ + + +ZGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T11-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO1x1 + ble ZGEMM_L1x1_SUB0 + bl ZGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x1_SAVE + b ZGEMM_L1x1_SUB2 + + +ZGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x1_32K + addi BO,BO,-16 + addi AO,AO,-16 + LOAD1x1O 16,16 + END1x1_WITHOUT_ADD + LOAD1x1_2O 32, 32 + mtctr T8 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne ZGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-32 + LOAD1x1_2O 32,32 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x1_SUB2_8 + bl ZGEMM_1x1_L16_SUB + MY_ALIGN + + +ZGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x1_SUB2_4 + bl ZGEMM_1x1_L8_SUB + MY_ALIGN + + +ZGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 32,32, 0,0 + KERNEL1x1_E2 32,32, 1,1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 32,32, 0,1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x1_SAVE + KERNEL1x1 + + +ZGEMM_L1x1_SAVE: +/*----------------------------------------*/ + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 +#endif + + +ZGEMM_L1x1_END: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + + +ZGEMM_L1_END: +/*----------------------------------------*/ \ No newline at end of file diff --git a/kernel/power/zgemm_macros_power9.S b/kernel/power/zgemm_macros_power9.S index 8670e9574..68024b826 100644 --- a/kernel/power/zgemm_macros_power9.S +++ b/kernel/power/zgemm_macros_power9.S @@ -1,1825 +1,1825 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#define unit_size 16 -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) -#define DISPX(disp) (disp) -/* HELPERS FOR SAVE */ -/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */ - - -.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET -#ifndef TRMMKERNEL - lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG) - lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG) - xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2 - xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 -#endif -.endm -/*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/ - - -.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2 - xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ - xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ -.endm -/*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/ - - -.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 - xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ - xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ -.endm -/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/ - - -.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2 -#else // CC || CR || RC || RR - /*we will assume {-alpha_r,-alpha_i} for this case */ - /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ - xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1 - /*we will negate alpha image instead instead to fix sign*/ - xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#endif -.endm -/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */ - - -.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 -#ifndef TRMMKERNEL - xvmsubadp \VSOUT1,\VSINII, alpha_i - xvmaddadp \VSOUT2,\VSINRR, alpha_i -#else - xvmuldp \VSOUT1,\VSINII, alpha_i - xvmuldp \VSOUT2,\VSINRR, alpha_i -#endif -.endm -/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ - - -.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 - xvmsubadp \VSOUT1,\VSINRR, alpha_r - xvmaddadp \VSOUT2,\VSINII, alpha_r -.endm -/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */ - - -.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 - xxmrghd \VSOUT1,\VSIN2,\VSIN1 - xxmrgld \VSOUT2,\VSIN2,\VSIN1 -.endm - - -.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2 - stxv \VSIN1, DISPX(\LOFFSET)(\REG) - stxv \VSIN2, DISPX(\LOFFSET+16)(\REG) -.endm - - -.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 - LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET - RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 - LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32) - 
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7 - LOAD_COUPLE_AS_RR_II vs24,vs25,vs18,vs19,\BASE_REG,(\LOFFSET +64) - RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 - LOAD_COUPLE_AS_RR_II vs26,vs27,vs20,vs21,\BASE_REG,(\LOFFSET+96) - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs10,vs11 - AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 - RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs12,vs13 - AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2 - MULT_APLHA_PART1 vs2,vs4, vs14,vs15 - RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4 - MULT_APLHA_PART1 vs6,vs8,vs16,vs17 - MULT_APLHA_PART2 vs2,vs4,vs14,vs15 - AGGREGATE_REALS_IMAGES vs10,vs11,vs12,vs13 - MULT_APLHA_PART2 vs6,vs8,vs16,vs17 - AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4 - UNPACK_FOR_STORE vs14,vs15,vs7,vs9 - MULT_APLHA_PART1 vs10,vs12, vs24,vs25 - UNPACK_FOR_STORE vs16,vs17,vs3,vs5 - MULT_APLHA_PART1 \VSRes1,\VSRes3, vs26,vs27 - STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 - MULT_APLHA_PART2 vs10,vs12,vs24,vs25 - STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 - MULT_APLHA_PART2 \VSRes1,\VSRes3, vs26,vs27 - UNPACK_FOR_STORE vs24,vs25,vs10,vs12 - UNPACK_FOR_STORE vs26,vs27,\VSRes1,\VSRes3 - STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs10,vs12 - STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3 -.endm - - -.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 - LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET - RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 - LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32) - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7 - RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 - AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 - AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 - MULT_APLHA_PART1 vs2,vs4, vs14,vs15 - MULT_APLHA_PART1 vs6,vs8, vs16,vs17 - MULT_APLHA_PART2 vs2,vs4, vs14,vs15 - MULT_APLHA_PART2 vs6,vs8,vs16,vs17 - UNPACK_FOR_STORE vs14,vs15,vs7,vs9 - UNPACK_FOR_STORE vs16,vs17,vs3,vs5 - STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 - STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 -.endm - - - -.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 - LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET - RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 - AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 - MULT_APLHA_PART1 vs2,vs4, vs14,vs15 - MULT_APLHA_PART2 vs2,vs4, vs14,vs15 - UNPACK_FOR_STORE vs14,vs15,vs7,vs9 - STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 -.endm - - - -.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3 -#ifndef TRMMKERNEL - lxv vs18, (\LOFFSET)(\BASE_REG) - xxmrgld vs14,vs18,vs18 - xxmrghd vs15,vs18,vs18 -#endif - RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs4,vs5 - AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 - MULT_APLHA_PART1 vs2,vs4, vs14,vs15 - MULT_APLHA_PART2 vs2,vs4, vs14,vs15 - UNPACK_FOR_STORE vs14,vs15,vs7,vs9 - xxmrghd vs7,vs15,vs14 - stxv vs7, (\LOFFSET)(\BASE_REG) -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=8 -**********************************************************************************************/ - -.macro Zero2x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 
- xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endm - - -.macro LOAD2x8 - LOAD2x8O 0,0 -.endm - - -.macro LOAD2x8O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END2x8_NORMAL - END2x8 AO,BO,128,32 -.endm - - -.macro END2x8_WITHOUT_ADD - END2x8 AO,BO,0,0 -.endm - - -.macro END2x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs48, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs49, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs50, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs51, vs1, vs19 - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs52, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs53, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs54, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs55, vs3, vs19 - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs56, vs4, vs18 - xvmaddadp vs41, vs4, vs17 - xvmaddadp vs57, vs4, vs19 - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs58, vs5, vs18 - xvmaddadp vs43, vs5, vs17 - xvmaddadp vs59, vs5, vs19 - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs60, vs6, vs18 - xvmaddadp vs45, vs6, vs17 - xvmaddadp vs61, vs6, vs19 - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs62, vs7, vs18 - xvmaddadp vs47, vs7, vs17 - xvmaddadp vs63, vs7, vs19 -.endm - - -.macro LOAD2x8_2 - LOAD2x8_2O 0,0 -.endm - - -.macro LOAD2x8_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A - lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A - lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A - lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A - lxv vs12, 
(128+64+\OffsetA)(AO) // load real,imag from A - lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A - lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A - lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x8_2 - /*for load2 offset will be 256 and 64*/ - KERNEL2x8_2 AO,BO, 256,64,0 ,1,1 -.endm - - - -.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs48, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs49, vs0, vs19 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs50, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs51, vs1, vs19 -.if \Complete==0 - lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs52, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs53, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs54, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs55, vs3, vs19 -.if \Complete==0 - lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs56, vs4, vs18 - xvmaddadp vs41, vs4, vs17 - xvmaddadp vs57, vs4, vs19 - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs58, vs5, vs18 - xvmaddadp vs43, vs5, vs17 - xvmaddadp vs59, vs5, vs19 -.if \Complete==0 - lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs60, vs6, vs18 - xvmaddadp vs45, vs6, vs17 - xvmaddadp vs61, vs6, vs19 - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs62, vs7, vs18 - xvmaddadp vs47, vs7, vs17 - xvmaddadp vs63, vs7, vs19 -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs48, vs8, vs22 -.if \Complete==0 - lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs49, vs8, vs23 -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs50, vs9, vs22 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs51, vs9, vs23 -.if \Complete==0 - lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs52, vs10, vs22 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs53, vs10, vs23 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs54, vs11, vs22 - xvmaddadp vs39, vs11, vs21 - xvmaddadp vs55, vs11, vs23 -.if \Complete==0 - lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs12, vs20 - xvmaddadp vs56, vs12, vs22 - xvmaddadp vs41, vs12, vs21 - xvmaddadp vs57, vs12, vs23 - xvmaddadp vs42, vs13, vs20 - xvmaddadp vs58, vs13, vs22 - xvmaddadp vs43, vs13, vs21 - xvmaddadp vs59, vs13, 
vs23 -.if \Complete==0 - lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs14, vs20 - xvmaddadp vs60, vs14, vs22 - xvmaddadp vs45, vs14, vs21 - xvmaddadp vs61, vs14, vs23 - xvmaddadp vs46, vs15, vs20 - xvmaddadp vs62, vs15, vs22 - xvmaddadp vs47, vs15, vs21 - xvmaddadp vs63, vs15, vs23 -.if \Complete==0 - lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - - - -.macro KERNEL2x8 - LOAD2x8 - END2x8 AO, BO, 128,32 -.endm - - -.macro SAVE2x8 - add T1, CO ,LDC - SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 - SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0 - addi CO, CO, 128 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=4 -**********************************************************************************************/ - - -.macro Zero2x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 -.endm - - -.macro LOAD2x4 - LOAD2x4O 0,0 -.endm - - -.macro LOAD2x4O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x4_NORMAL - END2x4 AO,BO,64,32 -.endm - - -.macro END2x4_WITHOUT_ADD - END2x4 AO,BO,0,0 -.endm - - -.macro END2x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs40, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs41, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs42, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs43, vs1, vs19 - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs44, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs45, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs46, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs47, vs3, vs19 - -.endm - - -.macro LOAD2x4_2 - LOAD2x4_2O 0,0 -.endm - - -.macro LOAD2x4_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, 
(16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs8, (64+\OffsetA)(AO) // load real,imag from A - lxv vs9, (80+\OffsetA)(AO) // load real,imag from A - lxv vs10, (96+\OffsetA)(AO) // load real,imag from A - lxv vs11, (112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x4_2 - /*for load2 offset will be 128 and 64*/ - KERNEL2x4_2 AO,BO, 128,64,0 ,1,1 -.endm - - - -.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs40, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs41, vs0, vs19 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs42, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs43, vs1, vs19 -.if \Complete==0 - lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs44, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs45, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs46, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs47, vs3, vs19 -.if \Complete==0 - lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs40, vs8, vs22 - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs41, vs8, vs23 -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs42, vs9, vs22 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs43, vs9, vs23 -.if \Complete==0 - lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs44, vs10, vs22 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs45, vs10, vs23 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs46, vs11, vs22 - xvmaddadp vs39, vs11, vs21 - xvmaddadp vs47, vs11, vs23 -.if \Complete==0 - lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - -.macro KERNEL2x4 - LOAD2x4 - END2x4 AO, BO, 64,32 -.endm - - - -.macro SAVE2x4 - add T1, CO ,LDC - SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 - SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0 - addi CO, CO, 64 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=2 -**********************************************************************************************/ - - -.macro Zero2x2 - xxlxor 
vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - -.endm - - -.macro LOAD2x2 - LOAD2x2O 0,0 -.endm - - -.macro LOAD2x2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END2x2_NORMAL - END2x2 AO,BO,32,32 -.endm - - -.macro END2x2_WITHOUT_ADD - END2x2 AO,BO,0,0 -.endm - - -.macro END2x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs36, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs37, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs38, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs39, vs1, vs19 - -.endm - - -.macro LOAD2x2_2 - LOAD2x2_2O 0,0 -.endm - - -.macro LOAD2x2_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs8, (32+\OffsetA)(AO) // load real,imag from A - lxv vs9, (48+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END2x2_2 - /*for load2 offset will be 64 and 64*/ - KERNEL2x2_2 AO,BO, 64,64,0 ,1,1 -.endm - - - -.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs36, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs37, vs0, vs19 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs38, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs39, vs1, vs19 -.if \Complete==0 - lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs36, vs8, vs22 - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs37, vs8, vs23 -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs38, vs9, vs22 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs39, vs9, vs23 -.if \Complete==0 - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \Complete==0 - lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - -.macro KERNEL2x2 - LOAD2x2 - 
END2x2 AO, BO, 32,32 -.endm - - - -.macro SAVE2x2 - add T1, CO ,LDC - SAVE2 vs32,vs33,vs34,vs35,CO,0 - SAVE2 vs36,vs37,vs38,vs39,T1,0 - addi CO, CO, 32 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=1 -**********************************************************************************************/ - - - -.macro Zero2x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - -.endm - - -.macro LOAD2x1 - LOAD2x1O 0,0 -.endm - - -.macro LOAD2x1O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x1_NORMAL - END2x1 AO,BO,16,32 -.endm - - -.macro END2x1_WITHOUT_ADD - END2x1 AO,BO,0,0 -.endm - - -.macro END2x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs34, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs35, vs0, vs19 -.endm - - -.macro LOAD2x1_2 - LOAD2x1_2O 0,0 -.endm - - -.macro LOAD2x1_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs8, (16+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x1_2 - /*for load2 offset will be 32 and 64*/ - KERNEL2x1_2 AO,BO, 32,64,0 ,1,1 -.endm - - - -.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs34, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs35, vs0, vs19 -.if \Complete==0 - lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs34, vs8, vs22 - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs35, vs8, vs23 -.if \Complete==0 - lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - -.macro KERNEL2x1 - LOAD2x1 - END2x1 AO, BO, 16,32 -.endm - - - -.macro SAVE2x1 - add T1, CO ,LDC - SAVE1 vs32,vs33,CO,0 - SAVE1 vs34,vs35,T1,0 - addi CO, CO, 16 -.endm - -/********************************************************************************************** -* - -.macros for N=1 and M=8 
-**********************************************************************************************/ - - -.macro Zero1x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 -.endm - - -.macro LOAD1x8 - LOAD1x8O 0,0 -.endm - - -.macro LOAD1x8O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - xxswapd vs17, vs16 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END1x8_NORMAL - END1x8 AO,BO,128,16 -.endm - - -.macro END1x8_WITHOUT_ADD - END1x8 AO,BO,0,0 -.endm - - -.macro END1x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 - - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs41, vs4, vs17 - - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs43, vs5, vs17 - - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs45, vs6, vs17 - - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs47, vs7, vs17 - -.endm - - -.macro LOAD1x8_2 - LOAD1x8_2O 0,0 -.endm - - -.macro LOAD1x8_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A - lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A - lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A - lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A - lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A - lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A - lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A - lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x8_2 - /*for load2 offset will be 256 and 32*/ - KERNEL1x8_2 AO,BO, 256,32,0 ,1,1 -.endm - - - -.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xvmaddadp vs34, vs1, 
vs16 - xvmaddadp vs35, vs1, vs17 -.if \Complete==0 - lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 -.if \Complete==0 - lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs41, vs4, vs17 - - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs43, vs5, vs17 -.if \Complete==0 - lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs45, vs6, vs17 - - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs47, vs7, vs17 -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs39, vs11, vs21 -.if \Complete==0 - lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs12, vs20 - xvmaddadp vs41, vs12, vs21 - xvmaddadp vs42, vs13, vs20 - xvmaddadp vs43, vs13, vs21 -.if \Complete==0 - lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs14, vs20 - xvmaddadp vs45, vs14, vs21 - xvmaddadp vs46, vs15, vs20 - xvmaddadp vs47, vs15, vs21 -.if \Complete==0 - lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - - - -.macro KERNEL1x8 - LOAD1x8 - END1x8 AO, BO, 128,16 -.endm - - -.macro SAVE1x8 - SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 - addi CO, CO, 128 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=4 -**********************************************************************************************/ - - -.macro Zero1x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 -.endm - - -.macro LOAD1x4 - LOAD1x4O 0,0 -.endm - - -.macro LOAD1x4O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag 
from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END1x4_NORMAL - END1x4 AO,BO,64,16 -.endm - - -.macro END1x4_WITHOUT_ADD - END1x4 AO,BO,0,0 -.endm - - -.macro END1x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 - -.endm - - -.macro LOAD1x4_2 - LOAD1x4_2O 0,0 -.endm - - -.macro LOAD1x4_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs8, (64+\OffsetA)(AO) // load real,imag from A - lxv vs9, (80+\OffsetA)(AO) // load real,imag from A - lxv vs10, (96+\OffsetA)(AO) // load real,imag from A - lxv vs11, (112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x4_2 - /*for load2 offset will be 128 and 32*/ - KERNEL1x4_2 AO,BO, 128,32,0 ,1,1 -.endm - - - -.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 -.if \Complete==0 - lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 -.if \Complete==0 - lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs39, vs11, vs21 -.if \Complete==0 - lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - -.macro KERNEL1x4 - LOAD1x4 - END1x4 AO, BO, 64,16 -.endm - - - -.macro SAVE1x4 - SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 - addi CO, 
CO, 64 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=2 -**********************************************************************************************/ - - -.macro Zero1x2 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - -.endm - - -.macro LOAD1x2 - LOAD1x2O 0,0 -.endm - - -.macro LOAD1x2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END1x2_NORMAL - END1x2 AO,BO,32,16 -.endm - - -.macro END1x2_WITHOUT_ADD - END1x2 AO,BO,0,0 -.endm - - -.macro END1x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - -.endm - - -.macro LOAD1x2_2 - LOAD1x2_2O 0,0 -.endm - - -.macro LOAD1x2_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs8, (32+\OffsetA)(AO) // load real,imag from A - lxv vs9, (48+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x2_2 - /*for load2 offset will be 64 and 32*/ - KERNEL1x2_2 AO,BO, 64,32,0 ,1,1 -.endm - - - -.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 -.if \Complete==0 - lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \Complete==0 - lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - -.macro KERNEL1x2 - LOAD1x2 - END1x2 AO, BO, 32,16 -.endm - - - -.macro SAVE1x2 - SAVE2 vs32,vs33,vs34,vs35,CO,0 - addi CO, CO, 32 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=1 -**********************************************************************************************/ - - - -.macro Zero1x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 -.endm - - -.macro LOAD1x1 - LOAD1x1O 0,0 -.endm - - -.macro LOAD1x1O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs0, 
(0+\OffsetA)(AO) // load real,imag from A - xxswapd vs17, vs16 - -.endm - - -.macro END1x1_NORMAL - END1x1 AO,BO,16,16 -.endm - - -.macro END1x1_WITHOUT_ADD - END1x1 AO,BO,0,0 -.endm - - -.macro END1x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 -.endm - - -.macro LOAD1x1_2 - LOAD1x1_2O 0,0 -.endm - - -.macro LOAD1x1_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs8, (16+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x1_2 - /*for load2 offset will be 32 and 32*/ - KERNEL1x1_2 AO,BO, 32,32,0 ,1,1 -.endm - - - -.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xxswapd vs21, vs20 - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 -.if \Complete==0 - lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - -.macro KERNEL1x1 - LOAD1x1 - END1x1 AO, BO, 16,16 -.endm - - - -.macro SAVE1x1 - SAVE1 vs32,vs33,CO,0 - addi CO, CO, 16 -.endm - -/****************************TRMM POINTER REFRESH - -.macroSES*************************/ - - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 8 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 7 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 4 - .endif -.endm -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*16; -// ptrbb = bb + off*2; -// #endif -*/ - - -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - #else - /* - // ptrba =ptrba+ off*C_A; - // ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+16; // number of values in A -// #else -// temp = off+2; // number of values in B -// #endif -*/ - - -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 
- /* temp = bk-off;*/
- sub \TEMP_BK,\BK_VAL,\OFF_VAL
- #elif defined(LEFT)
- /* temp = off+INCR_A; // number of values in A */
- addi \TEMP_BK, \OFF_VAL, \INCR_A
- #else
- /* temp = off+INCR_B // number of values in B*/
- addi \TEMP_BK,\OFF_VAL, \INCR_B
- #endif
-.endm
-/*
-// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-// temp = bk - off;
-// #ifdef LEFT
-// temp -= 16; // number of values in A
-// #else
-// temp -= 2; // number of values in B
-// #endif
-// ptrba += temp*16;
-// ptrbb += temp*2;
-// #endif
-// #ifdef LEFT
-// off += 16; // number of values in A
-// #endif
-*/
-
-
-
-.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
- #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- /*temp = bk - off;*/
- sub \TEMP_BK,\BK_VAL,\OFF_VAL
- #ifdef LEFT
- /*temp -= 8; // number of values in A*/
- addi \TEMP_BK,\TEMP_BK,-\C_A
- #else
- /*temp -= 4; // number of values in B*/
- addi \TEMP_BK,\TEMP_BK,-\C_B
- #endif
- /*ptrba += temp*C_A;
- ptrbb += temp*C_B;*/
- SHIFT_REG T4,\TEMP_BK,\C_A
- SHIFT_REG T2,\TEMP_BK,\C_B
- add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/
- add \PTR_B, \PTR_B,T2
- #endif
- #ifdef LEFT
- /*off += 8; // number of values in A*/
- addi \OFF_VAL,\OFF_VAL,\C_A
- #endif
+/***************************************************************************
+Copyright (c) 2013-2019, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define unit_size 16
+#define DISP32(ind,disp) (ind*unit_size*32+disp)
+#define DISP16(ind,disp) (ind*unit_size*16+disp)
+#define DISP8(ind,disp) (ind*unit_size*8+disp)
+#define DISP4(ind,disp) (ind*unit_size*4+disp)
+#define DISP2(ind,disp) (ind*unit_size*2+disp)
+#define DISP1(ind,disp) (ind*unit_size+disp)
+#define DISPX(disp) (disp)
+/* HELPERS FOR SAVE */
+/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */
+
+
+.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET
+#ifndef TRMMKERNEL
+ lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)
+ lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG)
+ xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2
+ xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2
+#endif
+.endm
+/*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/
+
+
+.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/
+ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/
+.endm
+/*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/
+
+
+.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
+ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/
+.endm
+/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/
+
+
+.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+ xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
+ xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
+ xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
+ xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2
+#else // CC || CR || RC || RR
+ /*we will assume {-alpha_r,-alpha_i} for this case */
+ /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/
+ xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1
+ /*we will negate alpha image instead instead to fix sign*/
+ xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
+#endif
+.endm
+/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */
+
+
+.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
+#ifndef TRMMKERNEL
+ xvmsubadp \VSOUT1,\VSINII, alpha_i
+ xvmaddadp \VSOUT2,\VSINRR, alpha_i
+#else
+ xvmuldp \VSOUT1,\VSINII, alpha_i
+ xvmuldp \VSOUT2,\VSINRR, alpha_i
+#endif
+.endm
+/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
+
+
+.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
+ xvmsubadp \VSOUT1,\VSINRR, alpha_r
+ xvmaddadp \VSOUT2,\VSINII, alpha_r
+.endm
+/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */
+
+
+.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
+ xxmrghd \VSOUT1,\VSIN2,\VSIN1
+ xxmrgld \VSOUT2,\VSIN2,\VSIN1
+.endm
+
+
+.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2
+ stxv \VSIN1, DISPX(\LOFFSET)(\REG)
+ stxv \VSIN2, DISPX(\LOFFSET+16)(\REG)
+.endm
+
+
+.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
+ LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
+ LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32)
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7
+ LOAD_COUPLE_AS_RR_II vs24,vs25,vs18,vs19,\BASE_REG,(\LOFFSET +64)
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9
+ LOAD_COUPLE_AS_RR_II vs26,vs27,vs20,vs21,\BASE_REG,(\LOFFSET+96)
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs10,vs11
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs12,vs13
+ AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4
+ MULT_APLHA_PART1 vs6,vs8,vs16,vs17
+ MULT_APLHA_PART2 vs2,vs4,vs14,vs15
+ AGGREGATE_REALS_IMAGES vs10,vs11,vs12,vs13
+ MULT_APLHA_PART2 vs6,vs8,vs16,vs17
+ AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9
+ MULT_APLHA_PART1 vs10,vs12, vs24,vs25
+ UNPACK_FOR_STORE vs16,vs17,vs3,vs5
+ MULT_APLHA_PART1 \VSRes1,\VSRes3, vs26,vs27
+ STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
+ MULT_APLHA_PART2 vs10,vs12,vs24,vs25
+ STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5
+ MULT_APLHA_PART2 \VSRes1,\VSRes3, vs26,vs27
+ UNPACK_FOR_STORE vs24,vs25,vs10,vs12
+ UNPACK_FOR_STORE vs26,vs27,\VSRes1,\VSRes3
+ STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs10,vs12
+ STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3
+.endm
+
+
+.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
+ LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
+ LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32)
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
+ AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15
+ MULT_APLHA_PART1 vs6,vs8, vs16,vs17
+ MULT_APLHA_PART2 vs2,vs4, vs14,vs15
+ MULT_APLHA_PART2 vs6,vs8,vs16,vs17
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9
+ UNPACK_FOR_STORE vs16,vs17,vs3,vs5
+ STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
+ STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5
+.endm
+
+
+
+.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3
+ LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15
+ MULT_APLHA_PART2 vs2,vs4, vs14,vs15
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9
+ STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9
+.endm
+
+
+
+.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET
+ RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3
+#ifndef TRMMKERNEL
+ lxv vs18, (\LOFFSET)(\BASE_REG)
+ xxmrgld vs14,vs18,vs18
+ xxmrghd vs15,vs18,vs18
+#endif
+ RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs4,vs5
+ AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5
+ MULT_APLHA_PART1 vs2,vs4, vs14,vs15
+ MULT_APLHA_PART2 vs2,vs4, vs14,vs15
+ UNPACK_FOR_STORE vs14,vs15,vs7,vs9
+ xxmrghd vs7,vs15,vs14
+ stxv vs7, (\LOFFSET)(\BASE_REG)
+.endm
+/**********************************************************************************************
+*
+
+.macros for N=2 and M=8
+**********************************************************************************************/
+
+.macro Zero2x8
+ xxlxor vs32, vs32, vs32
+ xxlxor vs33, vs33, vs33
+ xxlxor vs34, vs34, vs34
+ xxlxor vs35, vs35, vs35
+ xxlxor vs36, vs36, vs36
+ xxlxor vs37, vs37, vs37
+ xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 + xxlxor vs49, vs49, vs49 + xxlxor vs50, vs50, vs50 + xxlxor vs51, vs51, vs51 + xxlxor vs52, vs52, vs52 + xxlxor vs53, vs53, vs53 + xxlxor vs54, vs54, vs54 + xxlxor vs55, vs55, vs55 + xxlxor vs56, vs56, vs56 + xxlxor vs57, vs57, vs57 + xxlxor vs58, vs58, vs58 + xxlxor vs59, vs59, vs59 + xxlxor vs60, vs60, vs60 + xxlxor vs61, vs61, vs61 + xxlxor vs62, vs62, vs62 + xxlxor vs63, vs63, vs63 +.endm + + +.macro LOAD2x8 + LOAD2x8O 0,0 +.endm + + +.macro LOAD2x8O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END2x8_NORMAL + END2x8 AO,BO,128,32 +.endm + + +.macro END2x8_WITHOUT_ADD + END2x8 AO,BO,0,0 +.endm + + +.macro END2x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs48, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs49, vs0, vs19 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs50, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs51, vs1, vs19 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs52, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs53, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs54, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs55, vs3, vs19 + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs56, vs4, vs18 + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs57, vs4, vs19 + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs58, vs5, vs18 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs59, vs5, vs19 + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs60, vs6, vs18 + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs61, vs6, vs19 + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs62, vs7, vs18 + xvmaddadp vs47, vs7, vs17 + xvmaddadp vs63, vs7, vs19 +.endm + + +.macro LOAD2x8_2 + LOAD2x8_2O 0,0 +.endm + + +.macro LOAD2x8_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A + lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A + lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A + lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A + lxv vs12, 
(128+64+\OffsetA)(AO) // load real,imag from A + lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A + lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A + lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x8_2 + /*for load2 offset will be 256 and 64*/ + KERNEL2x8_2 AO,BO, 256,64,0 ,1,1 +.endm + + + +.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs48, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs49, vs0, vs19 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs50, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs51, vs1, vs19 +.if \Complete==0 + lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs52, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs53, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs54, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs55, vs3, vs19 +.if \Complete==0 + lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs56, vs4, vs18 + xvmaddadp vs41, vs4, vs17 + xvmaddadp vs57, vs4, vs19 + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs58, vs5, vs18 + xvmaddadp vs43, vs5, vs17 + xvmaddadp vs59, vs5, vs19 +.if \Complete==0 + lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs60, vs6, vs18 + xvmaddadp vs45, vs6, vs17 + xvmaddadp vs61, vs6, vs19 + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs62, vs7, vs18 + xvmaddadp vs47, vs7, vs17 + xvmaddadp vs63, vs7, vs19 +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs48, vs8, vs22 +.if \Complete==0 + lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs49, vs8, vs23 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs50, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs51, vs9, vs23 +.if \Complete==0 + lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs52, vs10, vs22 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs53, vs10, vs23 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs54, vs11, vs22 + xvmaddadp vs39, vs11, vs21 + xvmaddadp vs55, vs11, vs23 +.if \Complete==0 + lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs12, vs20 + xvmaddadp vs56, vs12, vs22 + xvmaddadp vs41, vs12, vs21 + xvmaddadp vs57, vs12, vs23 + xvmaddadp vs42, vs13, vs20 + xvmaddadp vs58, vs13, vs22 + xvmaddadp vs43, vs13, vs21 + xvmaddadp vs59, vs13, 
vs23 +.if \Complete==0 + lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs14, vs20 + xvmaddadp vs60, vs14, vs22 + xvmaddadp vs45, vs14, vs21 + xvmaddadp vs61, vs14, vs23 + xvmaddadp vs46, vs15, vs20 + xvmaddadp vs62, vs15, vs22 + xvmaddadp vs47, vs15, vs21 + xvmaddadp vs63, vs15, vs23 +.if \Complete==0 + lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + + + + +.macro KERNEL2x8 + LOAD2x8 + END2x8 AO, BO, 128,32 +.endm + + +.macro SAVE2x8 + add T1, CO ,LDC + SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 + SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0 + addi CO, CO, 128 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=4 +**********************************************************************************************/ + + +.macro Zero2x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 +.endm + + +.macro LOAD2x4 + LOAD2x4O 0,0 +.endm + + +.macro LOAD2x4O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x4_NORMAL + END2x4 AO,BO,64,32 +.endm + + +.macro END2x4_WITHOUT_ADD + END2x4 AO,BO,0,0 +.endm + + +.macro END2x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs41, vs0, vs19 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs43, vs1, vs19 + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs45, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs47, vs3, vs19 + +.endm + + +.macro LOAD2x4_2 + LOAD2x4_2O 0,0 +.endm + + +.macro LOAD2x4_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, 
(16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs8, (64+\OffsetA)(AO) // load real,imag from A + lxv vs9, (80+\OffsetA)(AO) // load real,imag from A + lxv vs10, (96+\OffsetA)(AO) // load real,imag from A + lxv vs11, (112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x4_2 + /*for load2 offset will be 128 and 64*/ + KERNEL2x4_2 AO,BO, 128,64,0 ,1,1 +.endm + + + +.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs40, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs41, vs0, vs19 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs42, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs43, vs1, vs19 +.if \Complete==0 + lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs44, vs2, vs18 + xvmaddadp vs37, vs2, vs17 + xvmaddadp vs45, vs2, vs19 + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs46, vs3, vs18 + xvmaddadp vs39, vs3, vs17 + xvmaddadp vs47, vs3, vs19 +.if \Complete==0 + lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs40, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs41, vs8, vs23 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs42, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs43, vs9, vs23 +.if \Complete==0 + lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs44, vs10, vs22 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs45, vs10, vs23 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs46, vs11, vs22 + xvmaddadp vs39, vs11, vs21 + xvmaddadp vs47, vs11, vs23 +.if \Complete==0 + lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + + +.macro KERNEL2x4 + LOAD2x4 + END2x4 AO, BO, 64,32 +.endm + + + +.macro SAVE2x4 + add T1, CO ,LDC + SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 + SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0 + addi CO, CO, 64 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=2 +**********************************************************************************************/ + + +.macro Zero2x2 + xxlxor 
vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + +.endm + + +.macro LOAD2x2 + LOAD2x2O 0,0 +.endm + + +.macro LOAD2x2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END2x2_NORMAL + END2x2 AO,BO,32,32 +.endm + + +.macro END2x2_WITHOUT_ADD + END2x2 AO,BO,0,0 +.endm + + +.macro END2x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs36, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs37, vs0, vs19 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs39, vs1, vs19 + +.endm + + +.macro LOAD2x2_2 + LOAD2x2_2O 0,0 +.endm + + +.macro LOAD2x2_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs8, (32+\OffsetA)(AO) // load real,imag from A + lxv vs9, (48+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END2x2_2 + /*for load2 offset will be 64 and 64*/ + KERNEL2x2_2 AO,BO, 64,64,0 ,1,1 +.endm + + + +.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs36, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs37, vs0, vs19 + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs38, vs1, vs18 + xvmaddadp vs35, vs1, vs17 + xvmaddadp vs39, vs1, vs19 +.if \Complete==0 + lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs36, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs37, vs8, vs23 +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs38, vs9, vs22 + xvmaddadp vs35, vs9, vs21 + xvmaddadp vs39, vs9, vs23 +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + + +.macro KERNEL2x2 + LOAD2x2 + 
END2x2 AO, BO, 32,32 +.endm + + + +.macro SAVE2x2 + add T1, CO ,LDC + SAVE2 vs32,vs33,vs34,vs35,CO,0 + SAVE2 vs36,vs37,vs38,vs39,T1,0 + addi CO, CO, 32 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=1 +**********************************************************************************************/ + + + +.macro Zero2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endm + + +.macro LOAD2x1 + LOAD2x1O 0,0 +.endm + + +.macro LOAD2x1O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x1_NORMAL + END2x1 AO,BO,16,32 +.endm + + +.macro END2x1_WITHOUT_ADD + END2x1 AO,BO,0,0 +.endm + + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs35, vs0, vs19 +.endm + + +.macro LOAD2x1_2 + LOAD2x1_2O 0,0 +.endm + + +.macro LOAD2x1_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs18, (\OffsetB+16)(BO) // load real,imag from B + lxv vs20, (\OffsetB+32)(BO) // load real,imag from B + lxv vs22, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs17, vs16 + xxswapd vs19, vs18 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs8, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x1_2 + /*for load2 offset will be 32 and 64*/ + KERNEL2x1_2 AO,BO, 32,64,0 ,1,1 +.endm + + + +.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs21, vs20 + xxswapd vs23, vs22 + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs34, vs0, vs18 + xvmaddadp vs33, vs0, vs17 + xvmaddadp vs35, vs0, vs19 +.if \Complete==0 + lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 + xxswapd vs19, vs18 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs34, vs8, vs22 + xvmaddadp vs33, vs8, vs21 + xvmaddadp vs35, vs8, vs23 +.if \Complete==0 + lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 16,32 +.endm + + + +.macro SAVE2x1 + add T1, CO ,LDC + SAVE1 vs32,vs33,CO,0 + SAVE1 vs34,vs35,T1,0 + addi CO, CO, 16 +.endm + +/********************************************************************************************** +* + +.macros for N=1 and M=8 
+**********************************************************************************************/ + + +.macro Zero1x8 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 + xxlxor vs40, vs40, vs40 + xxlxor vs41, vs41, vs41 + xxlxor vs42, vs42, vs42 + xxlxor vs43, vs43, vs43 + xxlxor vs44, vs44, vs44 + xxlxor vs45, vs45, vs45 + xxlxor vs46, vs46, vs46 + xxlxor vs47, vs47, vs47 + xxlxor vs48, vs48, vs48 +.endm + + +.macro LOAD1x8 + LOAD1x8O 0,0 +.endm + + +.macro LOAD1x8O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END1x8_NORMAL + END1x8 AO,BO,128,16 +.endm + + +.macro END1x8_WITHOUT_ADD + END1x8 AO,BO,0,0 +.endm + + +.macro END1x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs41, vs4, vs17 + + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs43, vs5, vs17 + + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs45, vs6, vs17 + + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs47, vs7, vs17 + +.endm + + +.macro LOAD1x8_2 + LOAD1x8_2O 0,0 +.endm + + +.macro LOAD1x8_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs4, (64+\OffsetA)(AO) // load real,imag from A + lxv vs5, (80+\OffsetA)(AO) // load real,imag from A + lxv vs6, (96+\OffsetA)(AO) // load real,imag from A + lxv vs7, (112+\OffsetA)(AO) // load real,imag from A + lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A + lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A + lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A + lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A + lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A + lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A + lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A + lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x8_2 + /*for load2 offset will be 256 and 32*/ + KERNEL1x8_2 AO,BO, 256,32,0 ,1,1 +.endm + + + +.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xvmaddadp vs34, vs1, 
vs16 + xvmaddadp vs35, vs1, vs17 +.if \Complete==0 + lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 +.if \Complete==0 + lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs4, vs16 + xvmaddadp vs41, vs4, vs17 + + xvmaddadp vs42, vs5, vs16 + xvmaddadp vs43, vs5, vs17 +.if \Complete==0 + lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A + lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs6, vs16 + xvmaddadp vs45, vs6, vs17 + + xvmaddadp vs46, vs7, vs16 + xvmaddadp vs47, vs7, vs17 +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs39, vs11, vs21 +.if \Complete==0 + lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs40, vs12, vs20 + xvmaddadp vs41, vs12, vs21 + xvmaddadp vs42, vs13, vs20 + xvmaddadp vs43, vs13, vs21 +.if \Complete==0 + lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A + lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs44, vs14, vs20 + xvmaddadp vs45, vs14, vs21 + xvmaddadp vs46, vs15, vs20 + xvmaddadp vs47, vs15, vs21 +.if \Complete==0 + lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP16(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP16(\Index,256) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + + + +.macro KERNEL1x8 + LOAD1x8 + END1x8 AO, BO, 128,16 +.endm + + +.macro SAVE1x8 + SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 + addi CO, CO, 128 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=4 +**********************************************************************************************/ + + +.macro Zero1x4 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + xxlxor vs36, vs36, vs36 + xxlxor vs37, vs37, vs37 + xxlxor vs38, vs38, vs38 + xxlxor vs39, vs39, vs39 +.endm + + +.macro LOAD1x4 + LOAD1x4O 0,0 +.endm + + +.macro LOAD1x4O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag 
from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END1x4_NORMAL + END1x4 AO,BO,64,16 +.endm + + +.macro END1x4_WITHOUT_ADD + END1x4 AO,BO,0,0 +.endm + + +.macro END1x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 + +.endm + + +.macro LOAD1x4_2 + LOAD1x4_2O 0,0 +.endm + + +.macro LOAD1x4_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs2, (32+\OffsetA)(AO) // load real,imag from A + lxv vs3, (48+\OffsetA)(AO) // load real,imag from A + lxv vs8, (64+\OffsetA)(AO) // load real,imag from A + lxv vs9, (80+\OffsetA)(AO) // load real,imag from A + lxv vs10, (96+\OffsetA)(AO) // load real,imag from A + lxv vs11, (112+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x4_2 + /*for load2 offset will be 128 and 32*/ + KERNEL1x4_2 AO,BO, 128,32,0 ,1,1 +.endm + + + +.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 +.if \Complete==0 + lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs2, vs16 + xvmaddadp vs37, vs2, vs17 + + xvmaddadp vs38, vs3, vs16 + xvmaddadp vs39, vs3, vs17 +.if \Complete==0 + lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + xvmaddadp vs36, vs10, vs20 + xvmaddadp vs37, vs10, vs21 + xvmaddadp vs38, vs11, vs20 + xvmaddadp vs39, vs11, vs21 +.if \Complete==0 + lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A + lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP8(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP8(\Index,128) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + +.macro KERNEL1x4 + LOAD1x4 + END1x4 AO, BO, 64,16 +.endm + + + +.macro SAVE1x4 + SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 + addi CO, 
CO, 64 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=2 +**********************************************************************************************/ + + +.macro Zero1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 + xxlxor vs34, vs34, vs34 + xxlxor vs35, vs35, vs35 + +.endm + + +.macro LOAD1x2 + LOAD1x2O 0,0 +.endm + + +.macro LOAD1x2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + +.endm + + +.macro END1x2_NORMAL + END1x2 AO,BO,32,16 +.endm + + +.macro END1x2_WITHOUT_ADD + END1x2 AO,BO,0,0 +.endm + + +.macro END1x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 + +.endm + + +.macro LOAD1x2_2 + LOAD1x2_2O 0,0 +.endm + + +.macro LOAD1x2_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs1, (16+\OffsetA)(AO) // load real,imag from A + lxv vs8, (32+\OffsetA)(AO) // load real,imag from A + lxv vs9, (48+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x2_2 + /*for load2 offset will be 64 and 32*/ + KERNEL1x2_2 AO,BO, 64,32,0 ,1,1 +.endm + + + +.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 + xxswapd vs21, vs20 + xvmaddadp vs34, vs1, vs16 + xvmaddadp vs35, vs1, vs17 +.if \Complete==0 + lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A + lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs34, vs9, vs20 + xvmaddadp vs35, vs9, vs21 +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A + lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A +.endif + + + +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index,64) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + +.macro KERNEL1x2 + LOAD1x2 + END1x2 AO, BO, 32,16 +.endm + + + +.macro SAVE1x2 + SAVE2 vs32,vs33,vs34,vs35,CO,0 + addi CO, CO, 32 +.endm +/********************************************************************************************** +* + +.macros for N=2 and M=1 +**********************************************************************************************/ + + + +.macro Zero1x1 + xxlxor vs32, vs32, vs32 + xxlxor vs33, vs33, vs33 +.endm + + +.macro LOAD1x1 + LOAD1x1O 0,0 +.endm + + +.macro LOAD1x1O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs0, 
(0+\OffsetA)(AO) // load real,imag from A + xxswapd vs17, vs16 + +.endm + + +.macro END1x1_NORMAL + END1x1 AO,BO,16,16 +.endm + + +.macro END1x1_WITHOUT_ADD + END1x1 AO,BO,0,0 +.endm + + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 +.endm + + +.macro LOAD1x1_2 + LOAD1x1_2O 0,0 +.endm + + +.macro LOAD1x1_2O OffsetA,OffsetB + lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs20, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs17, vs16 + + lxv vs0, (0+\OffsetA)(AO) // load real,imag from A + lxv vs8, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x1_2 + /*for load2 offset will be 32 and 32*/ + KERNEL1x1_2 AO,BO, 32,32,0 ,1,1 +.endm + + + +.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs21, vs20 + xvmaddadp vs32, vs0, vs16 + xvmaddadp vs33, vs0, vs17 +.if \Complete==0 + lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif +.if \Complete==0 + xxswapd vs17, vs16 +.endif + xvmaddadp vs32, vs8, vs20 + xvmaddadp vs33, vs8, vs21 +.if \Complete==0 + lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + +.macro KERNEL1x1 + LOAD1x1 + END1x1 AO, BO, 16,16 +.endm + + + +.macro SAVE1x1 + SAVE1 vs32,vs33,CO,0 + addi CO, CO, 16 +.endm + +/****************************TRMM POINTER REFRESH + +.macroSES*************************/ + + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 8 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 4 + .endif +.endm +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + off*2; +// #endif +*/ + + +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ + + +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 
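(Editorial note, not part of the patch: the SHIFT_REG and REFRESH_POINTERS macros above convert the TRMM offset `off` from an element count into a byte displacement. Every packed value in these double-complex kernels is 16 bytes, which is why SHIFT_VAL values 16/8/4/2/1 map to left shifts of 8/7/6/5/4. A rough C equivalent, with illustrative names and a plain flag standing in for the LEFT/TRANSA preprocessor test:)

#include <stddef.h>

/* Editorial sketch only: models SHIFT_REG / REFRESH_POINTERS above for the
 * double-complex kernels, where one packed value occupies 16 bytes.        */
static size_t shift_bytes(size_t off, unsigned width)       /* SHIFT_REG    */
{
    return off * width * 16u;     /* == off << (log2(width) + 4) in the asm */
}

static void refresh_pointers(double **ptr_a, double **ptr_b, double *bb,
                             size_t off, unsigned c_a, unsigned c_b,
                             int keep_b_at_start)   /* LEFT/TRANSA #if test */
{
    if (keep_b_at_start) {
        *ptr_b = bb;                                 /* ptrbb = bb          */
    } else {                                         /* ptrba += off*C_A... */
        *ptr_a = (double *)((char *)*ptr_a + shift_bytes(off, c_a));
        *ptr_b = (double *)((char *)bb + shift_bytes(off, c_b));
    }
}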
+ /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + #endif + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif .endm \ No newline at end of file diff --git a/kernel/x86_64/sgemm_kernel_16x4_haswell.S b/kernel/x86_64/sgemm_kernel_16x4_haswell.S index ef156fd27..76ea12fee 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_haswell.S +++ b/kernel/x86_64/sgemm_kernel_16x4_haswell.S @@ -1,6806 +1,6806 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************************/ - -/********************************************************************* -* 2014/07/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* 2013/10/28 Saar -* Parameter: -* SGEMM_DEFAULT_UNROLL_N 4 -* SGEMM_DEFAULT_UNROLL_M 16 -* SGEMM_DEFAULT_P 768 -* SGEMM_DEFAULT_Q 384 -* A_PR1 512 -* B_PR1 512 -* -* -* 2014/07/28 Saar -* Performance at 9216x9216x9216: -* 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) -* 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) -* 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) -* 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) -* -*********************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define BO2 %rbp -#define SP %rbx - -#define BO1 %rdi -#define CO2 %rdx - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#if defined(OS_WINDOWS) -#define L_BUFFER_SIZE 8192 -#else -#define L_BUFFER_SIZE 12288 -#endif - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#if defined(BULLDOZER) - -#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 - -#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0 - -#else - -#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0 - -#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0 - -#endif - - -#define A_PR1 512 -#define B_PR1 512 - -/******************************************************************************************* -* 6 lines of N -*******************************************************************************************/ - -.macro KERNEL16x6_SUB - vmovups -16 * SIZE(AO), %ymm0 - vmovups -8 * SIZE(AO), %ymm1 - vbroadcastss -4 * SIZE(BO), %ymm2 - vbroadcastss -3 * SIZE(BO), %ymm3 - prefetcht0 A_PR1(AO) - - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) - - vbroadcastss -2 * SIZE(BO), %ymm2 - vbroadcastss -1 * SIZE(BO), %ymm3 - VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) - - vbroadcastss 0 * SIZE(BO), %ymm2 - vbroadcastss 1 * SIZE(BO), %ymm3 - VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm13,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm15,%ymm3,%ymm1 ) - 
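(Editorial note, not part of the patch: this hunk spans the whole of sgemm_kernel_16x4_haswell.S with an unchanged line count, so the macros shown as removed here reappear later in the patch. KERNEL16x6_SUB, whose tail is just above, performs one rank-1 update of a 16x6 tile: two 8-float loads of A, six broadcast values of B, and twelve packed FMAs into ymm4..ymm15 before advancing AO and BO. A scalar C model of one k step, with illustrative names:)

/* Editorial sketch only: scalar model of one k iteration of KERNEL16x6_SUB.
 * acc[j][i] stands for the twelve accumulators ymm4..ymm15, with j indexing
 * the six broadcast B values and i the sixteen packed A values.            */
static void kernel16x6_step(const float *ao, const float *bo, float acc[6][16])
{
    for (int j = 0; j < 6; j++)            /* vbroadcastss of B[j]          */
        for (int i = 0; i < 16; i++)       /* two 8-wide vmovups from AO    */
            acc[j][i] += ao[i] * bo[j];    /* VFMADD231PS_                  */
}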
- addq $ 6*SIZE, BO - addq $ 16*SIZE, AO - decq %rax -.endm - -.macro SAVE16x6 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm9 , %ymm9 - vmulps %ymm0 , %ymm10, %ymm10 - vmulps %ymm0 , %ymm11, %ymm11 - vmulps %ymm0 , %ymm12, %ymm12 - vmulps %ymm0 , %ymm13, %ymm13 - vmulps %ymm0 , %ymm14, %ymm14 - vmulps %ymm0 , %ymm15, %ymm15 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - - vaddps (CO1, LDC,2), %ymm8,%ymm8 - vaddps 8 * SIZE(CO1, LDC,2), %ymm9,%ymm9 - - vaddps (CO2), %ymm10,%ymm10 - vaddps 8 * SIZE(CO2), %ymm11,%ymm11 - - vaddps (CO2, LDC), %ymm12,%ymm12 - vaddps 8 * SIZE(CO2, LDC), %ymm13,%ymm13 - - vaddps (CO2, LDC,2), %ymm14,%ymm14 - vaddps 8 * SIZE(CO2, LDC,2), %ymm15,%ymm15 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - - vmovups %ymm8 , (CO1, LDC,2) - vmovups %ymm9 , 8 * SIZE(CO1, LDC,2) - - vmovups %ymm10, (CO2) - vmovups %ymm11, 8 * SIZE(CO2) - - vmovups %ymm12, (CO2, LDC) - vmovups %ymm13, 8 * SIZE(CO2, LDC) - - vmovups %ymm14, (CO2, LDC,2) - vmovups %ymm15, 8 * SIZE(CO2, LDC,2) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x6_SUB - vmovups -16 * SIZE(AO), %ymm0 - vbroadcastss -4 * SIZE(BO), %ymm2 - vbroadcastss -3 * SIZE(BO), %ymm3 - - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - - vbroadcastss -2 * SIZE(BO), %ymm2 - vbroadcastss -1 * SIZE(BO), %ymm3 - VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - - vbroadcastss 0 * SIZE(BO), %ymm2 - vbroadcastss 1 * SIZE(BO), %ymm3 - VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) - - addq $ 6*SIZE, BO - addq $ 8*SIZE, AO - decq %rax -.endm - -.macro SAVE8x6 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm10, %ymm10 - vmulps %ymm0 , %ymm12, %ymm12 - vmulps %ymm0 , %ymm14, %ymm14 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps (CO1, LDC,2), %ymm8,%ymm8 - vaddps (CO2), %ymm10,%ymm10 - vaddps (CO2, LDC), %ymm12,%ymm12 - vaddps (CO2, LDC,2), %ymm14,%ymm14 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm8 , (CO1, LDC,2) - vmovups %ymm10, (CO2) - vmovups %ymm12, (CO2, LDC) - vmovups %ymm14, (CO2, LDC,2) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x6_SUB - vmovups -16 * SIZE(AO), %xmm0 - vbroadcastss -4 * SIZE(BO), %xmm2 - vbroadcastss -3 * SIZE(BO), %xmm3 - - VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) - - vbroadcastss -2 * SIZE(BO), %xmm2 - vbroadcastss -1 * SIZE(BO), %xmm3 - VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) - - vbroadcastss 0 * SIZE(BO), %xmm2 - vbroadcastss 1 * SIZE(BO), %xmm3 - VFMADD231PS_( %xmm12,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm14,%xmm3,%xmm0 ) - - addq $ 6*SIZE, BO - addq $ 4*SIZE, AO - decq %rax -.endm - -.macro SAVE4x6 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - vmulps %xmm0 , %xmm8 , %xmm8 - vmulps %xmm0 , %xmm10, %xmm10 - vmulps %xmm0 , 
%xmm12, %xmm12 - vmulps %xmm0 , %xmm14, %xmm14 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps (CO1, LDC,2), %xmm8,%xmm8 - vaddps (CO2), %xmm10,%xmm10 - vaddps (CO2, LDC), %xmm12,%xmm12 - vaddps (CO2, LDC,2), %xmm14,%xmm14 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm8 , (CO1, LDC,2) - vmovups %xmm10, (CO2) - vmovups %xmm12, (CO2, LDC) - vmovups %xmm14, (CO2, LDC,2) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x6_SUB - vmovss -16 * SIZE(AO), %xmm0 - vmovss -15 * SIZE(AO), %xmm1 - vmovss -4 * SIZE(BO), %xmm2 - vmovss -3 * SIZE(BO), %xmm3 - - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) - - vmovss -2 * SIZE(BO), %xmm2 - vmovss -1 * SIZE(BO), %xmm3 - VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) - - vmovss 0 * SIZE(BO), %xmm2 - vmovss 1 * SIZE(BO), %xmm3 - VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm13,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm15,%xmm3,%xmm1 ) - - addq $ 6*SIZE, BO - addq $ 2*SIZE, AO - decq %rax -.endm - -.macro SAVE2x6 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm9 , %xmm9 - vmulss %xmm0 , %xmm10, %xmm10 - vmulss %xmm0 , %xmm11, %xmm11 - vmulss %xmm0 , %xmm12, %xmm12 - vmulss %xmm0 , %xmm13, %xmm13 - vmulss %xmm0 , %xmm14, %xmm14 - vmulss %xmm0 , %xmm15, %xmm15 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - - vaddss (CO1, LDC,2), %xmm8,%xmm8 - vaddss 1 * SIZE(CO1, LDC,2), %xmm9,%xmm9 - - vaddss (CO2), %xmm10,%xmm10 - vaddss 1 * SIZE(CO2), %xmm11,%xmm11 - - vaddss (CO2, LDC), %xmm12,%xmm12 - vaddss 1 * SIZE(CO2, LDC), %xmm13,%xmm13 - - vaddss (CO2, LDC,2), %xmm14,%xmm14 - vaddss 1 * SIZE(CO2, LDC,2), %xmm15,%xmm15 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - - vmovss %xmm8 , (CO1, LDC,2) - vmovss %xmm9 , 1 * SIZE(CO1, LDC,2) - - vmovss %xmm10, (CO2) - vmovss %xmm11, 1 * SIZE(CO2) - - vmovss %xmm12, (CO2, LDC) - vmovss %xmm13, 1 * SIZE(CO2, LDC) - - vmovss %xmm14, (CO2, LDC,2) - vmovss %xmm15, 1 * SIZE(CO2, LDC,2) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x6_SUB - vmovss -16 * SIZE(AO), %xmm0 - vmovss -4 * SIZE(BO), %xmm2 - vmovss -3 * SIZE(BO), %xmm3 - - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - - vmovss -2 * SIZE(BO), %xmm2 - vmovss -1 * SIZE(BO), %xmm3 - VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - - vmovss 0 * SIZE(BO), %xmm2 - vmovss 1 * SIZE(BO), %xmm3 - VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) - - addq $ 6*SIZE, BO - addq $ 1*SIZE, AO - decq %rax -.endm - -.macro SAVE1x6 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm10, %xmm10 - vmulss %xmm0 , %xmm12, %xmm12 - vmulss %xmm0 , %xmm14, %xmm14 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 
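(Editorial note, not part of the patch: the SAVE*x6 macros in this region share one epilogue shape: scale every accumulator by ALPHA, add the existing C tile unless TRMMKERNEL is defined, then store back to the six columns at CO1, CO1+LDC, CO1+2*LDC, CO2, CO2+LDC and CO2+2*LDC. In C terms, roughly, with illustrative names:)

/* Editorial sketch only: the common shape of the SAVEmx6 epilogues (m is the
 * row count of the tile: 16, 8, 4, 2 or 1).                                 */
static void save_mx6(float *c, long ldc, float alpha,
                     const float acc[6][16], int m, int trmmkernel)
{
    for (int j = 0; j < 6; j++)               /* CO1, CO1+LDC, ..., CO2+2*LDC */
        for (int i = 0; i < m; i++) {
            float v = alpha * acc[j][i];      /* vmulps/vmulss by ALPHA       */
            if (!trmmkernel)
                v += c[j * ldc + i];          /* vaddps/vaddss from C         */
            c[j * ldc + i] = v;               /* vmovups/vmovss store         */
        }
}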
- vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss (CO1, LDC,2), %xmm8,%xmm8 - vaddss (CO2), %xmm10,%xmm10 - vaddss (CO2, LDC), %xmm12,%xmm12 - vaddss (CO2, LDC,2), %xmm14,%xmm14 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm8 , (CO1, LDC,2) - vmovss %xmm10, (CO2) - vmovss %xmm12, (CO2, LDC) - vmovss %xmm14, (CO2, LDC,2) - -.endm - - -/*******************************************************************************************/ - - -/******************************************************************************************* -* 4 lines of N -*******************************************************************************************/ - -.macro KERNEL16x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) - addq $ 4 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x4 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm9 , %ymm9 - vmulps %ymm0 , %ymm10, %ymm10 - vmulps %ymm0 , %ymm11, %ymm11 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - - vaddps (CO2), %ymm8,%ymm8 - vaddps 8 * SIZE(CO2), %ymm9,%ymm9 - - vaddps (CO2, LDC), %ymm10,%ymm10 - vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - - vmovups %ymm8 , (CO2) - vmovups %ymm9 , 8 * SIZE(CO2) - - vmovups %ymm10, (CO2, LDC) - vmovups %ymm11, 8 * SIZE(CO2, LDC) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - prefetcht0 64(CO2) - prefetcht0 64(CO2, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) - addq $ 4 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x4 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm10, %ymm10 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps (CO2), %ymm8,%ymm8 - vaddps (CO2, LDC), %ymm10,%ymm10 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm8 , (CO2) - vmovups %ymm10, (CO2, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231PS_( 
%xmm4,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) - addq $ 4 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x4 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - vmulps %xmm0 , %xmm8 , %xmm8 - vmulps %xmm0 , %xmm10, %xmm10 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps (CO2), %xmm8,%xmm8 - vaddps (CO2, LDC), %xmm10,%xmm10 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm8 , (CO2) - vmovups %xmm10, (CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x4_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) - addq $ 4 , BI - addq $ 2, %rax -.endm - -.macro SAVE2x4 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm9 , %xmm9 - vmulss %xmm0 , %xmm10, %xmm10 - vmulss %xmm0 , %xmm11, %xmm11 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - - vaddss (CO2), %xmm8,%xmm8 - vaddss 1 * SIZE(CO2), %xmm9,%xmm9 - - vaddss (CO2, LDC), %xmm10,%xmm10 - vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - - vmovss %xmm8 , (CO2) - vmovss %xmm9 , 1 * SIZE(CO2) - - vmovss %xmm10, (CO2, LDC) - vmovss %xmm11, 1 * SIZE(CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x4_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) - addq $ 4 , BI - addq $ 1, %rax -.endm - -.macro SAVE1x4 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm10, %xmm10 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss (CO2), %xmm8,%xmm8 - vaddss (CO2, LDC), %xmm10,%xmm10 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm8 , (CO2) - vmovss %xmm10, (CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 2 lines of N 
-*******************************************************************************************/ - -.macro KERNEL16x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) - addq $ 2 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x2 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) - addq $ 2 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x2 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) - addq $ 2 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x2 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x2_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) - addq $ 2 , BI - addq $ 2, %rax -.endm - -.macro SAVE2x2 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x2_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) - 
addq $ 2 , BI - addq $ 1, %rax -.endm - -.macro SAVE1x2 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss (CO1, LDC), %xmm6,%xmm6 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -.macro KERNEL16x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) - addq $ 1 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x1 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL8x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) - addq $ 1 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x1 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - -#endif - - vmovups %ymm4 , (CO1) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) - addq $ 1 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x1 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x1_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) - addq $ 1 , BI - addq $ 2 , %rax -.endm - -.macro SAVE2x1 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x1_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) - addq $ 1 , BI - addq $ 1 , %rax -.endm - -.macro SAVE1x1 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - -#endif - - vmovss %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - -/************************************************************************************* -* GEMM Kernel -*************************************************************************************/ - - - PROLOGUE - 
PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $12, %rdi - divq %rdi // N / 12 - movq %rax, Ndiv6 // N / 12 - movq %rdx, Nmod6 // N % 12 - - movq Ndiv6, J - cmpq $0, J - je .L4_00 - ALIGN_4 - - -/*******************************************************************************************/ - -.L6_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - salq $2, %rax // 4 values of B - leaq (B, %rax,4), BO2 - movq BO2, B // next offset of B - movq K, %rax - - ALIGN_4 - - -.L6_02c: - - vmovups (BO1), %xmm0 - vmovsd (BO2), %xmm1 - vmovups %xmm0, (BO) - vmovsd %xmm1, 4*SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 6*SIZE,BO - decq %rax - jnz .L6_02c - - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc - leaq (C, LDC, 4), C - leaq (C, LDC, 2), C // c = c + 6 * ldc - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L6_20 - - ALIGN_4 - -.L6_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L6_16 - - ALIGN_4 - -.L6_12: - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - je .L6_16 - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - je .L6_16 - - jmp .L6_12 - ALIGN_4 - -.L6_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_19 - - ALIGN_4 - -.L6_17: - - KERNEL16x6_SUB - - jnz .L6_17 - ALIGN_4 - - -.L6_19: - - SAVE16x6 - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L6_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $15, M - jz .L6_60 // to next 6 lines of N - - testq $8, M - jz .L6_21pre - ALIGN_4 - -/**************************************************************************/ - -.L6_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_20_6 - - ALIGN_4 - -.L6_20_2: - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - 
KERNEL8x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - je .L6_20_6 - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - je .L6_20_6 - - jmp .L6_20_2 - ALIGN_4 - -.L6_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_20_9 - - ALIGN_4 - -.L6_20_7: - - KERNEL8x6_SUB - - jnz .L6_20_7 - ALIGN_4 - - -.L6_20_9: - - SAVE8x6 - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L6_21pre: - - testq $4, M - jz .L6_30 - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_26 - - ALIGN_4 - -.L6_22: - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - je .L6_26 - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - je .L6_26 - - jmp .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - ALIGN_4 - -.L6_27: - - KERNEL4x6_SUB - - jnz .L6_27 - ALIGN_4 - - -.L6_29: - - SAVE4x6 - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_36 - - ALIGN_4 - -.L6_32: - - prefetcht0 A_PR1(AO) - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - je .L6_36 - - prefetcht0 A_PR1(AO) - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - je .L6_36 - - jmp .L6_32 - ALIGN_4 - -.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - ALIGN_4 - -.L6_37: - - KERNEL2x6_SUB - - jnz .L6_37 - ALIGN_4 - - -.L6_39: - - SAVE2x6 - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L6_60 // to next 4 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_46 - - ALIGN_4 - -.L6_42: - - prefetcht0 A_PR1(AO) - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - je .L6_46 - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - je .L6_46 - - jmp .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - ALIGN_4 - -.L6_47: - - KERNEL1x6_SUB - - jnz .L6_47 - ALIGN_4 - - -.L6_49: - - SAVE1x6 - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L6_60: - - -/*******************************************************************************************/ - - -.L7_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - salq $2, %rax // 4 values of B - leaq (B, %rax,4), BO2 - movq K, %rax - - ALIGN_4 
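(Editorial note, not part of the patch: the copy loop that follows, .L7_02c, is the second half of the B repack for the 12-column outer loop. B appears to arrive packed in K x 4 panels; .L6_02c earlier in this hunk takes all four values of the first panel plus the first two of the second, and .L7_02c takes the remaining two of the second panel plus all four of the third, so each pass leaves a K x 6 strip in BUFFER1 for the 6-column kernels. A C sketch of the combined repack, with illustrative names:)

/* Editorial sketch only: combined effect of the .L6_02c and .L7_02c copy
 * loops for one group of 12 B columns (three K x 4 source panels feeding
 * two K x 6 working buffers).                                              */
static void repack_b_12cols(const float *b, long k, float *buf6a, float *buf6b)
{
    const float *p0 = b, *p1 = b + 4 * k, *p2 = b + 8 * k;
    for (long i = 0; i < k; i++) {
        for (int j = 0; j < 4; j++) buf6a[6 * i + j]     = p0[4 * i + j];
        for (int j = 0; j < 2; j++) buf6a[6 * i + 4 + j] = p1[4 * i + j];
        for (int j = 0; j < 2; j++) buf6b[6 * i + j]     = p1[4 * i + 2 + j];
        for (int j = 0; j < 4; j++) buf6b[6 * i + 2 + j] = p2[4 * i + j];
    }
}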
- - -.L7_02c: - - vmovsd 2*SIZE(BO1), %xmm0 - vmovups (BO2), %xmm1 - vmovsd %xmm0, (BO) - vmovups %xmm1, 2*SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 6*SIZE,BO - decq %rax - jnz .L7_02c - - movq BO2, B // next offset of B - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc - leaq (C, LDC, 4), C - leaq (C, LDC, 2), C // c = c + 6 * ldc - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L7_20 - - ALIGN_4 - -.L7_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L7_16 - - ALIGN_4 - -.L7_12: - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - je .L7_16 - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - KERNEL16x6_SUB - - je .L7_16 - - jmp .L7_12 - ALIGN_4 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - ALIGN_4 - -.L7_17: - - KERNEL16x6_SUB - - jnz .L7_17 - ALIGN_4 - - -.L7_19: - - SAVE16x6 - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L7_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L7_20: - // Test rest of M - - testq $15, M - jz .L7_60 // to next 6 lines of N - - testq $8, M - jz .L7_21pre - ALIGN_4 - -/**************************************************************************/ - -.L7_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_20_6 - - ALIGN_4 - -.L7_20_2: - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - je .L7_20_6 - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - prefetcht0 A_PR1(AO) - KERNEL8x6_SUB - KERNEL8x6_SUB - - je .L7_20_6 - - jmp .L7_20_2 - ALIGN_4 - -.L7_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_20_9 - - ALIGN_4 - -.L7_20_7: - - KERNEL8x6_SUB - - jnz .L7_20_7 - ALIGN_4 - - -.L7_20_9: - - SAVE8x6 - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L7_21pre: - - testq $4, M - jz .L7_30 - ALIGN_4 - -.L7_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_26 - - ALIGN_4 - -.L7_22: - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - je .L7_26 - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - prefetcht0 A_PR1(AO) - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - KERNEL4x6_SUB - - je .L7_26 - - jmp .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - ALIGN_4 - -.L7_27: - - KERNEL4x6_SUB - - jnz .L7_27 - ALIGN_4 - - -.L7_29: - - SAVE4x6 - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 
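(Editorial note, not part of the patch: every k loop in this file, including the .L7_12/.L7_16/.L7_17 sequence above, follows the same pattern: round the trip count down to a multiple of eight, run an unrolled body that re-tests the counter after each block of eight KERNEL*_SUB calls (each call ends with decq %rax), then handle the k % 8 remainder in a short second loop. Roughly, in C, with illustrative names:)

/* Editorial sketch only: control flow of the unrolled k loops in this file. */
typedef void (*kernel_sub_fn)(void *state);    /* stands for KERNELmxn_SUB   */

static void run_k_loop(long k, kernel_sub_fn sub, void *state)
{
    long rax = k & ~7L;                  /* andq $-8, %rax                    */
    while (rax > 0) {                    /* je .L*_16 after each block        */
        for (int u = 0; u < 8; u++) {
            sub(state);                  /* KERNELmxn_SUB ends with decq %rax */
            rax--;
        }
    }
    for (long r = k & 7L; r > 0; r--)    /* andq $7, %rax remainder           */
        sub(state);                      /* .L*_17                            */
}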
- - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_36 - - ALIGN_4 - -.L7_32: - - prefetcht0 A_PR1(AO) - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - je .L7_36 - - prefetcht0 A_PR1(AO) - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - KERNEL2x6_SUB - - je .L7_36 - - jmp .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - ALIGN_4 - -.L7_37: - - KERNEL2x6_SUB - - jnz .L7_37 - ALIGN_4 - - -.L7_39: - - SAVE2x6 - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L7_40: - testq $1, M - jz .L7_60 // to next 4 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_46 - - ALIGN_4 - -.L7_42: - - prefetcht0 A_PR1(AO) - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - je .L7_46 - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - KERNEL1x6_SUB - - je .L7_46 - - jmp .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - ALIGN_4 - -.L7_47: - - KERNEL1x6_SUB - - jnz .L7_47 - ALIGN_4 - - -.L7_49: - - SAVE1x6 - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L7_60: - - decq J // j -- - jg .L6_01 // next 12 lines of N - - - - -/*******************************************************************************************/ -.L4_00: - - movq Nmod6, J - sarq $2, J // j = j / 4 - cmpq $ 0, J - je .L2_00 - ALIGN_4 - - -.L4_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L4_01b - ALIGN_4 - - -.L4_01a: - prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 4*SIZE(BO1), %xmm1 - vmovups 8*SIZE(BO1), %xmm2 - vmovups 12*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 4*SIZE(BO) - vmovups %xmm2, 8*SIZE(BO) - vmovups %xmm3,12*SIZE(BO) - - addq $ 16*SIZE,BO1 - addq $ 16*SIZE,BO - decq %rax - jnz .L4_01a - - -.L4_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L4_02d - ALIGN_4 - -.L4_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L4_02c - -.L4_02d: - - movq BO1, B // next offset of B - -.L4_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L4_20 - - ALIGN_4 - -.L4_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L4_16 - movq %rax, BI // Index for BO - leaq (,BI,4) , BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_12: - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - jmp .L4_12 - ALIGN_4 - -.L4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_19 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_17: - - KERNEL16x4_SUB - - jl .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE16x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $15, M - jz .L4_60 // to next 3 lines of N - - testq $8, M - jz .L4_21pre - ALIGN_4 - -/**************************************************************************/ - -.L4_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_20_6 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_2: - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - jmp .L4_20_2 - ALIGN_4 - -.L4_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_20_9 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_7: - - KERNEL8x4_SUB - - jl .L4_20_7 - ALIGN_4 - - -.L4_20_9: - - SAVE8x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L4_21pre: - - testq $4, M - jz .L4_30 - ALIGN_4 - -.L4_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_26 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_22: - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - jmp .L4_22 - ALIGN_4 - -.L4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_29 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = 
BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_27: - - KERNEL4x4_SUB - - jl .L4_27 - ALIGN_4 - - -.L4_29: - - SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_36 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - jmp .L4_32 - ALIGN_4 - -.L4_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_39 - - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - jl .L4_37 - ALIGN_4 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L4_40: - testq $1, M - jz .L4_60 // to next 4 lines of N - - ALIGN_4 - -.L4_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first 
buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L4_46 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - jmp .L4_42 - ALIGN_4 - -.L4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_49 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - jl .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L4_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK -#endif - - decq J // j -- - jg .L4_01 // next 4 lines of N - - - -/*******************************************************************************************/ -.L2_00: - - movq Nmod6, J - andq $3, J // j % 4 - je .L999 - - movq Nmod6, J - andq $2, J // j % 4 - je .L1_0 - -.L2_01: - - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - - vmovsd (BO1), %xmm0 - vmovsd 2*SIZE(BO1), %xmm1 - vmovsd 4*SIZE(BO1), %xmm2 - vmovsd 6*SIZE(BO1), %xmm3 - - vmovsd %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovsd %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - 
leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 2 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - 
negq %rax - ALIGN_4 - -.L2_20_2: - - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, 
BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // 
Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - 
KERNEL16x1_SUB - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if 
!defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax 
-#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - -#else - -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq 
%rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $4, %rdi - divq %rdi // N / 4 - movq %rax, Ndiv6 // N / 4 - movq %rdx, Nmod6 // N % 4 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -/*******************************************************************************************/ - -.L4_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L4_01b - ALIGN_4 - - -.L4_01a: - prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 4*SIZE(BO1), %xmm1 - vmovups 8*SIZE(BO1), %xmm2 - vmovups 12*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 4*SIZE(BO) - vmovups %xmm2, 8*SIZE(BO) - vmovups %xmm3,12*SIZE(BO) - - addq $ 16*SIZE,BO1 - addq $ 16*SIZE,BO - decq %rax - jnz .L4_01a - - -.L4_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L4_02d - ALIGN_4 - -.L4_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L4_02c - -.L4_02d: - - movq BO1, B // next offset of B - -.L4_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L4_20 - - ALIGN_4 - -.L4_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L4_16 - movq %rax, BI // Index for BO - leaq (,BI,4) , BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of 
values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_12: - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - jmp .L4_12 - ALIGN_4 - -.L4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_19 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_17: - - KERNEL16x4_SUB - - jl .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE16x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $15, M - jz .L4_60 // to next 3 lines of N - - testq $8, M - jz .L4_21pre - ALIGN_4 - -/**************************************************************************/ - -.L4_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_20_6 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq 
(BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_2: - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - jmp .L4_20_2 - ALIGN_4 - -.L4_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_20_9 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_7: - - KERNEL8x4_SUB - - jl .L4_20_7 - ALIGN_4 - - -.L4_20_9: - - SAVE8x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L4_21pre: - - testq $4, M - jz .L4_30 - ALIGN_4 - -.L4_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_26 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_22: - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - jmp .L4_22 - ALIGN_4 - -.L4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_29 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_27: - - KERNEL4x4_SUB - - jl .L4_27 - ALIGN_4 - - -.L4_29: - - SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 
4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_36 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - jmp .L4_32 - ALIGN_4 - -.L4_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_39 - - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - jl .L4_37 - ALIGN_4 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L4_40: - testq $1, M - jz .L4_60 // to next 4 lines of N - - ALIGN_4 - -.L4_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - 
addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L4_46 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - jmp .L4_42 - ALIGN_4 - -.L4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_49 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - jl .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L4_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK -#endif - - decq J // j -- - jg .L4_01 // next 4 lines of N - - - -/*******************************************************************************************/ -.L2_0: - - movq Nmod6, J - andq $3, J // j % 4 - je .L999 - - movq Nmod6, J - andq $2, J // j % 4 - je .L1_0 - -.L2_01: - - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - - vmovsd (BO1), %xmm0 - vmovsd 2*SIZE(BO1), %xmm1 - vmovsd 4*SIZE(BO1), %xmm2 - vmovsd 6*SIZE(BO1), %xmm3 - - vmovsd %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovsd %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax 
// number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 2 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, 
%rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq 
BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB 
- - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - -#if 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO 
-#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#endif - +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/********************************************************************* +* 2014/07/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* SGEMM_DEFAULT_UNROLL_N 4 +* SGEMM_DEFAULT_UNROLL_M 16 +* SGEMM_DEFAULT_P 768 +* SGEMM_DEFAULT_Q 384 +* A_PR1 512 +* B_PR1 512 +* +* +* 2014/07/28 Saar +* Performance at 9216x9216x9216: +* 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83) +* 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155) +* 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230) +* 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267) +* +*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define BO2 %rbp +#define SP %rbx + +#define BO1 %rdi +#define CO2 %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#if defined(OS_WINDOWS) +#define L_BUFFER_SIZE 8192 +#else +#define L_BUFFER_SIZE 12288 +#endif + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +#define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 + +#define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0 + +#else + +#define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0 + +#define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0 + +#endif + + +#define A_PR1 512 +#define B_PR1 512 + 
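For orientation only: the VFMADD231PS_ / VFMADD231SS_ wrappers above select the four-operand FMA4 encoding (vfmaddps / vfmaddss) when BULLDOZER is defined and the FMA3 encoding (vfmadd231ps / vfmadd231ss) otherwise, so the kernel macros that follow are written once for both instruction sets. The C model below is an illustrative sketch, not OpenBLAS code (the name micro_kernel_16x6_ref and the array shapes are assumptions); it shows the arithmetic that one expansion of the KERNEL16x6_SUB macro defined next performs: a rank-1 update of a 16x6 block of C accumulators from 16 packed A values and 6 packed B values.

/*
 * Illustrative C model (hypothetical helper, not part of this patch) of one
 * KERNEL16x6_SUB step.  In the assembly, acc[j][0..7] and acc[j][8..15]
 * live in two ymm registers each, and every b[j] is a vbroadcastss feeding
 * a packed FMA via VFMADD231PS_.
 */
static void micro_kernel_16x6_ref(const float a[16], const float b[6],
                                  float acc[6][16])
{
    for (int j = 0; j < 6; j++)          /* 6 columns of B per step        */
        for (int i = 0; i < 16; i++)     /* 16 rows of A per step          */
            acc[j][i] += a[i] * b[j];    /* acc += a * broadcast(b[j])     */
}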
+/******************************************************************************************* +* 6 lines of N +*******************************************************************************************/ + +.macro KERNEL16x6_SUB + vmovups -16 * SIZE(AO), %ymm0 + vmovups -8 * SIZE(AO), %ymm1 + vbroadcastss -4 * SIZE(BO), %ymm2 + vbroadcastss -3 * SIZE(BO), %ymm3 + prefetcht0 A_PR1(AO) + + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) + + vbroadcastss -2 * SIZE(BO), %ymm2 + vbroadcastss -1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) + + vbroadcastss 0 * SIZE(BO), %ymm2 + vbroadcastss 1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm13,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm15,%ymm3,%ymm1 ) + + addq $ 6*SIZE, BO + addq $ 16*SIZE, AO + decq %rax +.endm + +.macro SAVE16x6 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm9 , %ymm9 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm11, %ymm11 + vmulps %ymm0 , %ymm12, %ymm12 + vmulps %ymm0 , %ymm13, %ymm13 + vmulps %ymm0 , %ymm14, %ymm14 + vmulps %ymm0 , %ymm15, %ymm15 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + + vaddps (CO1, LDC,2), %ymm8,%ymm8 + vaddps 8 * SIZE(CO1, LDC,2), %ymm9,%ymm9 + + vaddps (CO2), %ymm10,%ymm10 + vaddps 8 * SIZE(CO2), %ymm11,%ymm11 + + vaddps (CO2, LDC), %ymm12,%ymm12 + vaddps 8 * SIZE(CO2, LDC), %ymm13,%ymm13 + + vaddps (CO2, LDC,2), %ymm14,%ymm14 + vaddps 8 * SIZE(CO2, LDC,2), %ymm15,%ymm15 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + + vmovups %ymm8 , (CO1, LDC,2) + vmovups %ymm9 , 8 * SIZE(CO1, LDC,2) + + vmovups %ymm10, (CO2) + vmovups %ymm11, 8 * SIZE(CO2) + + vmovups %ymm12, (CO2, LDC) + vmovups %ymm13, 8 * SIZE(CO2, LDC) + + vmovups %ymm14, (CO2, LDC,2) + vmovups %ymm15, 8 * SIZE(CO2, LDC,2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x6_SUB + vmovups -16 * SIZE(AO), %ymm0 + vbroadcastss -4 * SIZE(BO), %ymm2 + vbroadcastss -3 * SIZE(BO), %ymm3 + + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + + vbroadcastss -2 * SIZE(BO), %ymm2 + vbroadcastss -1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + + vbroadcastss 0 * SIZE(BO), %ymm2 + vbroadcastss 1 * SIZE(BO), %ymm3 + VFMADD231PS_( %ymm12,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm14,%ymm3,%ymm0 ) + + addq $ 6*SIZE, BO + addq $ 8*SIZE, AO + decq %rax +.endm + +.macro SAVE8x6 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm12, %ymm12 + vmulps %ymm0 , %ymm14, %ymm14 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO1, LDC,2), %ymm8,%ymm8 + vaddps (CO2), %ymm10,%ymm10 + vaddps (CO2, LDC), %ymm12,%ymm12 + vaddps (CO2, LDC,2), %ymm14,%ymm14 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, 
LDC) + vmovups %ymm8 , (CO1, LDC,2) + vmovups %ymm10, (CO2) + vmovups %ymm12, (CO2, LDC) + vmovups %ymm14, (CO2, LDC,2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x6_SUB + vmovups -16 * SIZE(AO), %xmm0 + vbroadcastss -4 * SIZE(BO), %xmm2 + vbroadcastss -3 * SIZE(BO), %xmm3 + + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + + vbroadcastss -2 * SIZE(BO), %xmm2 + vbroadcastss -1 * SIZE(BO), %xmm3 + VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) + + vbroadcastss 0 * SIZE(BO), %xmm2 + vbroadcastss 1 * SIZE(BO), %xmm3 + VFMADD231PS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm14,%xmm3,%xmm0 ) + + addq $ 6*SIZE, BO + addq $ 4*SIZE, AO + decq %rax +.endm + +.macro SAVE4x6 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + vmulps %xmm0 , %xmm12, %xmm12 + vmulps %xmm0 , %xmm14, %xmm14 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO1, LDC,2), %xmm8,%xmm8 + vaddps (CO2), %xmm10,%xmm10 + vaddps (CO2, LDC), %xmm12,%xmm12 + vaddps (CO2, LDC,2), %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO1, LDC,2) + vmovups %xmm10, (CO2) + vmovups %xmm12, (CO2, LDC) + vmovups %xmm14, (CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x6_SUB + vmovss -16 * SIZE(AO), %xmm0 + vmovss -15 * SIZE(AO), %xmm1 + vmovss -4 * SIZE(BO), %xmm2 + vmovss -3 * SIZE(BO), %xmm3 + + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + + vmovss -2 * SIZE(BO), %xmm2 + vmovss -1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) + + vmovss 0 * SIZE(BO), %xmm2 + vmovss 1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm13,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm15,%xmm3,%xmm1 ) + + addq $ 6*SIZE, BO + addq $ 2*SIZE, AO + decq %rax +.endm + +.macro SAVE2x6 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + vmulss %xmm0 , %xmm12, %xmm12 + vmulss %xmm0 , %xmm13, %xmm13 + vmulss %xmm0 , %xmm14, %xmm14 + vmulss %xmm0 , %xmm15, %xmm15 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddss (CO1, LDC,2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO1, LDC,2), %xmm9,%xmm9 + + vaddss (CO2), %xmm10,%xmm10 + vaddss 1 * SIZE(CO2), %xmm11,%xmm11 + + vaddss (CO2, LDC), %xmm12,%xmm12 + vaddss 1 * SIZE(CO2, LDC), %xmm13,%xmm13 + + vaddss (CO2, LDC,2), %xmm14,%xmm14 + vaddss 1 * SIZE(CO2, LDC,2), %xmm15,%xmm15 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO1, LDC,2) + vmovss %xmm9 , 1 * SIZE(CO1, LDC,2) + + vmovss %xmm10, (CO2) + vmovss %xmm11, 1 * SIZE(CO2) + + vmovss %xmm12, (CO2, LDC) + vmovss %xmm13, 1 * SIZE(CO2, LDC) + + vmovss 
%xmm14, (CO2, LDC,2) + vmovss %xmm15, 1 * SIZE(CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x6_SUB + vmovss -16 * SIZE(AO), %xmm0 + vmovss -4 * SIZE(BO), %xmm2 + vmovss -3 * SIZE(BO), %xmm3 + + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + + vmovss -2 * SIZE(BO), %xmm2 + vmovss -1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + + vmovss 0 * SIZE(BO), %xmm2 + vmovss 1 * SIZE(BO), %xmm3 + VFMADD231SS_( %xmm12,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm14,%xmm3,%xmm0 ) + + addq $ 6*SIZE, BO + addq $ 1*SIZE, AO + decq %rax +.endm + +.macro SAVE1x6 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm12, %xmm12 + vmulss %xmm0 , %xmm14, %xmm14 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss (CO1, LDC,2), %xmm8,%xmm8 + vaddss (CO2), %xmm10,%xmm10 + vaddss (CO2, LDC), %xmm12,%xmm12 + vaddss (CO2, LDC,2), %xmm14,%xmm14 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO1, LDC,2) + vmovss %xmm10, (CO2) + vmovss %xmm12, (CO2, LDC) + vmovss %xmm14, (CO2, LDC,2) + +.endm + + +/*******************************************************************************************/ + + +/******************************************************************************************* +* 4 lines of N +*******************************************************************************************/ + +.macro KERNEL16x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm9,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm11,%ymm3,%ymm1 ) + addq $ 4 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm9 , %ymm9 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm11, %ymm11 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + + vaddps (CO2), %ymm8,%ymm8 + vaddps 8 * SIZE(CO2), %ymm9,%ymm9 + + vaddps (CO2, LDC), %ymm10,%ymm10 + vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + + vmovups %ymm8 , (CO2) + vmovups %ymm9 , 8 * SIZE(CO2) + + vmovups %ymm10, (CO2, LDC) + vmovups %ymm11, 8 * SIZE(CO2, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + prefetcht0 64(CO2) + prefetcht0 64(CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + 
VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm8,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm10,%ymm3,%ymm0 ) + addq $ 4 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO2), %ymm8,%ymm8 + vaddps (CO2, LDC), %ymm10,%ymm10 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm8 , (CO2) + vmovups %ymm10, (CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm10,%xmm3,%xmm0 ) + addq $ 4 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x4 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO2), %xmm8,%xmm8 + vaddps (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO2) + vmovups %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm9,%xmm2,%xmm1 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm11,%xmm3,%xmm1 ) + addq $ 4 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddss (CO2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO2), %xmm9,%xmm9 + + vaddss (CO2, LDC), %xmm10,%xmm10 + vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO2) + vmovss %xmm9 , 1 * SIZE(CO2) + + vmovss %xmm10, (CO2, LDC) + vmovss %xmm11, 1 * SIZE(CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + 
VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm8,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm10,%xmm3,%xmm0 ) + addq $ 4 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss (CO2), %xmm8,%xmm8 + vaddss (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO2) + vmovss %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + VFMADD231PS_( %ymm7,%ymm3,%ymm1 ) + addq $ 2 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm6,%ymm3,%ymm0 ) + addq $ 2 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231PS_( %xmm6,%xmm3,%xmm0 ) + addq $ 2 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x2 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + VFMADD231SS_( 
%xmm6,%xmm3,%xmm0 ) + VFMADD231SS_( %xmm7,%xmm3,%xmm1 ) + addq $ 2 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm6,%xmm3,%xmm0 ) + addq $ 2 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +.macro KERNEL16x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + VFMADD231PS_( %ymm5,%ymm2,%ymm1 ) + addq $ 1 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL8x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PS_( %ymm4,%ymm2,%ymm0 ) + addq $ 1 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + +#endif + + vmovups %ymm4 , (CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231PS_( %xmm4,%xmm2,%xmm0 ) + addq $ 1 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x1 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + VFMADD231SS_( %xmm5,%xmm2,%xmm1 ) + addq $ 1 , BI + addq $ 2 , %rax +.endm + +.macro SAVE2x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * 
SIZE(CO1), %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SS_( %xmm4,%xmm2,%xmm0 ) + addq $ 1 , BI + addq $ 1 , %rax +.endm + +.macro SAVE1x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $12, %rdi + divq %rdi // N / 12 + movq %rax, Ndiv6 // N / 12 + movq %rdx, Nmod6 // N % 12 + + movq Ndiv6, J + cmpq $0, J + je .L4_00 + ALIGN_4 + + +/*******************************************************************************************/ + +.L6_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 4 values of B + leaq (B, %rax,4), BO2 + movq BO2, B // next offset of B + movq K, %rax + + ALIGN_4 + + +.L6_02c: + + vmovups (BO1), %xmm0 + vmovsd (BO2), %xmm1 + vmovups %xmm0, (BO) + vmovsd %xmm1, 4*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L6_02c + + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc + leaq (C, LDC, 4), C + leaq (C, LDC, 2), C // c = c + 6 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + + ALIGN_4 + +.L6_12: + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L6_16 + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + 
+ andq $7, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL16x6_SUB + + jnz .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE16x6 + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L6_60 // to next 6 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_20_6 + + ALIGN_4 + +.L6_20_2: + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L6_20_6 + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L6_20_6 + + jmp .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + ALIGN_4 + +.L6_20_7: + + KERNEL8x6_SUB + + jnz .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + SAVE8x6 + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + + ALIGN_4 + +.L6_22: + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L6_26 + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + ALIGN_4 + +.L6_27: + + KERNEL4x6_SUB + + jnz .L6_27 + ALIGN_4 + + +.L6_29: + + SAVE4x6 + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + + ALIGN_4 + +.L6_32: + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L6_36 + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + ALIGN_4 + +.L6_37: + + KERNEL2x6_SUB + + jnz .L6_37 + ALIGN_4 + + +.L6_39: + + SAVE2x6 + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L6_60 // to next 4 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + + ALIGN_4 + +.L6_42: + + 
prefetcht0 A_PR1(AO) + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L6_46 + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + ALIGN_4 + +.L6_47: + + KERNEL1x6_SUB + + jnz .L6_47 + ALIGN_4 + + +.L6_49: + + SAVE1x6 + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L6_60: + + +/*******************************************************************************************/ + + +.L7_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 4 values of B + leaq (B, %rax,4), BO2 + movq K, %rax + + ALIGN_4 + + +.L7_02c: + + vmovsd 2*SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovsd %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L7_02c + + movq BO2, B // next offset of B + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc + leaq (C, LDC, 4), C + leaq (C, LDC, 2), C // c = c + 6 * ldc + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L7_16 + + ALIGN_4 + +.L7_12: + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L7_16 + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + KERNEL16x6_SUB + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + ALIGN_4 + +.L7_17: + + KERNEL16x6_SUB + + jnz .L7_17 + ALIGN_4 + + +.L7_19: + + SAVE16x6 + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 6 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_20_6 + + ALIGN_4 + +.L7_20_2: + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L7_20_6 + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + prefetcht0 A_PR1(AO) + KERNEL8x6_SUB + KERNEL8x6_SUB + + je .L7_20_6 + + jmp .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + ALIGN_4 + +.L7_20_7: + + KERNEL8x6_SUB + + jnz .L7_20_7 + ALIGN_4 + + +.L7_20_9: + + SAVE8x6 + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + 
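The .L6_* and .L7_* column blocks above all share the same K-loop shape: "andq $-8, %rax" drives a main loop that expands the KERNEL*x6_SUB macro in groups of eight (the macro itself decrements %rax, and the interleaved "je" checks leave the loop on zero), while "andq $7, %rax" drives a one-step remainder loop (.L6_17, .L7_17, and so on) for the last K % 8 iterations. A minimal C sketch of that split, assuming a generic kernel_step callback (hypothetical, for illustration only):

/* Hypothetical sketch of the K-loop split used by the .L6_* / .L7_* blocks:
 * the unrolled body runs K & ~7 kernel steps, the tail runs K & 7 more.    */
static void k_loop_ref(long K, void (*kernel_step)(void))
{
    for (long k = 0; k < (K & ~7L); k += 8)
        for (int u = 0; u < 8; u++)      /* eight KERNEL*_SUB expansions    */
            kernel_step();
    for (long k = 0; k < (K & 7L); k++)  /* remainder loop, one step each   */
        kernel_step();
}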
+/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + + ALIGN_4 + +.L7_22: + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L7_26 + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + prefetcht0 A_PR1(AO) + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + KERNEL4x6_SUB + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + ALIGN_4 + +.L7_27: + + KERNEL4x6_SUB + + jnz .L7_27 + ALIGN_4 + + +.L7_29: + + SAVE4x6 + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + + ALIGN_4 + +.L7_32: + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L7_36 + + prefetcht0 A_PR1(AO) + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + KERNEL2x6_SUB + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + ALIGN_4 + +.L7_37: + + KERNEL2x6_SUB + + jnz .L7_37 + ALIGN_4 + + +.L7_39: + + SAVE2x6 + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 4 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_46 + + ALIGN_4 + +.L7_42: + + prefetcht0 A_PR1(AO) + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L7_46 + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + KERNEL1x6_SUB + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + ALIGN_4 + +.L7_47: + + KERNEL1x6_SUB + + jnz .L7_47 + ALIGN_4 + + +.L7_49: + + SAVE1x6 + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L7_60: + + decq J // j -- + jg .L6_01 // next 12 lines of N + + + + +/*******************************************************************************************/ +.L4_00: + + movq Nmod6, J + sarq $2, J // j = j / 4 + cmpq $ 0, J + je .L2_00 + ALIGN_4 + + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: 
+ + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + 
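(For orientation while reading the TRMM-conditional blocks above and below: each M-tile repeats the same offset bookkeeping, skipping the first KK packed entries of A and B, limiting the K loop through KKK, and advancing KK after the tile. The C sketch below is illustrative only; the names kk, k_iter, m_tile and n_tile are stand-ins for the KK/KKK registers and the 16/8/4/2/1-row tile sizes used in the assembly, not symbols from this patch.)

    /* Illustrative sketch, not part of the patch: the K-range selection
       performed by the TRMMKERNEL preprocessor blocks for one
       m_tile x n_tile block.  left/transa mirror the LEFT/TRANSA defines. */
    static long trmm_k_iterations(long k, long kk, long m_tile, long n_tile,
                                  int left, int transa)
    {
        if ((left && !transa) || (!left && transa))
            return k - kk;                    /* movq K,%rax ; subq KK,%rax */
        return kk + (left ? m_tile : n_tile); /* movq KK,%rax ; addq $tile,%rax */
    }

(When LEFT is defined, KK is advanced by the M-tile size after each SAVE; otherwise it is advanced by the column count, 4 or 2, at the .L4_60/.L2_60 labels at the end of each N iteration.)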
+/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 3 lines of N + + testq $8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, 
SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_00: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 
2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) 
|| \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + 
KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && 
defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * 
SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + 
KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 
SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // 
Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + +#else + +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $4, %rdi + divq %rdi // N / 4 + movq %rax, Ndiv6 // N / 4 + movq %rdx, Nmod6 // N % 4 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +/*******************************************************************************************/ + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: + + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if 
!defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 3 lines of N + + testq $8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if 
!defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of 
values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, 
SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_0: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + 
movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, 
%rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + 
andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq 
KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of 
values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + 
+.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + 
movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + 
movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#endif + diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h b/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h index 36b7aa1a3..970d63578 100644 --- a/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h @@ -1,226 +1,226 @@ -/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ -/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ -/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ - -#define init_m8n4(c1,c2,c3,c4)\ - "vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\ - "vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";" -#define INIT_m8n4 init_m8n4(4,5,6,7) -#define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11) -#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15) - -#define init_m4n4(c1,c2,c3,c4)\ - "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\ - "vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";" -#define INIT_m4n4 init_m4n4(4,5,6,7) -#define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11) -#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15) - -#define init_m2n4(c1,c2)\ - "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";" -#define INIT_m2n4 init_m2n4(4,5) -#define INIT_m2n8 INIT_m2n4 init_m2n4(6,7) -#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9) - -#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";" -#define INIT_m1n4 init_m1n4(4) -#define INIT_m1n8 INIT_m1n4 init_m1n4(5) -#define INIT_m1n12 INIT_m1n8 init_m1n4(6) - -#define GEMM_KERNEL_k1m8n4 \ - "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ - "vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\ - "vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;" -#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\ - "vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\ - "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;" -#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\ - "vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\ - "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;" - -#define GEMM_KERNEL_k1m4n4 \ - "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\ - "vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ - "vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" -#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\ - "vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\ - "vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;" -#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\ - "vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\ - 
"vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;" - -#define GEMM_KERNEL_k1m2n4 \ - "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\ - "vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;" -#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\ - "vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" -#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\ - "vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;" - -#define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;" -#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;" -#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;" - -#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\ - "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ - "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ - "vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\ - "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ - "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ - "vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";" - -#define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\ - "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ - "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ - "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ - "vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\ - "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ - "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ - "vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\ - "vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\ - "vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";" - -#define GEMM_SUM_REORDER_2x4(c1,c2)\ - "vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\ - "vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\ - "vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"\ - -#define GEMM_SUM_REORDER_1x4(c1)\ - "vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ - "vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ - "vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";" - -#define SOLVE_le_m4n2(b_off,c1,...)\ - "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ - "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ - "vmovsldup %%ymm"#c1",%%ymm1;" - -#define SOLVE_le_m8n2(b_off,c1,c2,...)\ - "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ - "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ - "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;" - -#define SOLVE_leri_m4n2(b_off,c1,...) 
SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\ - "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" - -#define SOLVE_leri_m8n2(b_off,c1,c2,...) SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" - -#define SOLVE_ri_m4n2(b_off,c1,...)\ - "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ - "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ - "vmovshdup %%ymm"#c1",%%ymm1;" - -#define SOLVE_ri_m8n2(b_off,c1,c2,...)\ - "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ - "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ - "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;" - -#define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\ - "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" - -#define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" - -#define SOLVE_col1_rtol_m1n4(b_off,c1,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vpermilps $0,%%xmm"#c1",%%xmm1;" - -#define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ - "vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;" - -#define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col2_mul_m1n4(b_off,c1,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vpermilps $85,%%xmm"#c1",%%xmm1;" - -#define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ - "vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;" - -#define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col2_ltor_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) 
SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col3_mul_m1n4(b_off,c1,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vpermilps $170,%%xmm"#c1",%%xmm1;" - -#define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ - "vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;" - -#define SOLVE_col3_rtol_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SOLVE_col4_ltor_m1n4(b_off,c1,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ - "vpermilps $255,%%xmm"#c1",%%xmm1;" - -#define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\ - "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ - "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ - "vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;" - -#define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\ - "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\ - "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" - -#define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" - -#define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" - -#define SUBTRACT_m2n4(b_off,c1,c2,...) 
SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" - -#define SAVE_SOLUTION_m8n2(c1,c2,a_off)\ - "vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ - "vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\ - "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\ - "vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;" - -#define SAVE_SOLUTION_m4n2(c1,a_off)\ - "vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\ - "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" - -#define SAVE_SOLUTION_m2n4(c1,c2,a_off)\ - "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\ - "vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;" - -#define SAVE_SOLUTION_m1n4(c1,a_off)\ - "vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ - "vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" +/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ +/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ +/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ + +#define init_m8n4(c1,c2,c3,c4)\ + "vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\ + "vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";" +#define INIT_m8n4 init_m8n4(4,5,6,7) +#define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11) +#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15) + +#define init_m4n4(c1,c2,c3,c4)\ + "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\ + "vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";" +#define INIT_m4n4 init_m4n4(4,5,6,7) +#define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11) +#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15) + +#define init_m2n4(c1,c2)\ + "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";" +#define INIT_m2n4 init_m2n4(4,5) +#define INIT_m2n8 INIT_m2n4 init_m2n4(6,7) +#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9) + +#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";" +#define INIT_m1n4 init_m1n4(4) +#define INIT_m1n8 INIT_m1n4 init_m1n4(5) +#define INIT_m1n12 INIT_m1n8 init_m1n4(6) + +#define GEMM_KERNEL_k1m8n4 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ + "vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\ + "vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;" +#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\ + "vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\ + "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;" +#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\ + "vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\ + "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;" + +#define GEMM_KERNEL_k1m4n4 \ + "vmovsldup (%0),%%xmm1; 
vmovshdup (%0),%%xmm2;"\ + "vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ + "vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" +#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\ + "vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\ + "vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;" +#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\ + "vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\ + "vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;" + +#define GEMM_KERNEL_k1m2n4 \ + "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\ + "vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;" +#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\ + "vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" +#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\ + "vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;" + +#define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;" +#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;" +#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;" + +#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\ + "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ + "vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\ + "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ + "vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";" + +#define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\ + "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ + "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ + "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ + "vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\ + "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ + "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ + "vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\ + "vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\ + "vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";" + +#define GEMM_SUM_REORDER_2x4(c1,c2)\ + "vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\ + "vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\ + "vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"\ + +#define GEMM_SUM_REORDER_1x4(c1)\ + "vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ + "vinsertps $32,(%3),%%xmm1,%%xmm1; 
vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ + "vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";" + +#define SOLVE_le_m4n2(b_off,c1,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ + "vmovsldup %%ymm"#c1",%%ymm1;" + +#define SOLVE_le_m8n2(b_off,c1,c2,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ + "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;" + +#define SOLVE_leri_m4n2(b_off,c1,...) SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\ + "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SOLVE_leri_m8n2(b_off,c1,c2,...) SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SOLVE_ri_m4n2(b_off,c1,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ + "vmovshdup %%ymm"#c1",%%ymm1;" + +#define SOLVE_ri_m8n2(b_off,c1,c2,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ + "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;" + +#define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\ + "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SOLVE_col1_rtol_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $0,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col2_mul_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $85,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col2_ltor_m1n4(b_off,c1,...) 
SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col3_mul_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $170,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col3_rtol_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col4_ltor_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $255,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SUBTRACT_m2n4(b_off,c1,c2,...) 
SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SAVE_SOLUTION_m8n2(c1,c2,a_off)\ + "vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ + "vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\ + "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\ + "vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SAVE_SOLUTION_m4n2(c1,a_off)\ + "vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\ + "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SAVE_SOLUTION_m2n4(c1,c2,a_off)\ + "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SAVE_SOLUTION_m1n4(c1,a_off)\ + "vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" diff --git a/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S b/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S index 94e2f6117..6c8b4c872 100644 --- a/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S +++ b/kernel/x86_64/zgemm_kernel_2x2_bulldozer.S @@ -1,1404 +1,1404 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. 
*/ -/*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -#define VFMADD_R vfmaddpd -#define VFMADD_I vfmaddpd -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfmaddpd -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -#define VFMADD_R vfmaddpd -#define VFMADD_I vfnmaddpd -#else -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfnmaddpd -#endif - - -#define A_PR1 384 -#define B_PR1 192 - -#define KERNEL2x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, 
SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $16, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq 
$16, BI ;\ - addq $8 , %rax ;\ - - -#define KERNEL1x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $2, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $2, BI ;\ - addq $4, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R 
%xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $2, %rax ;\ - - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA_R - vmovsd %xmm1, ALPHA_I - - salq $ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $8 * SIZE, AO - - movq M, I - sarq $1, I // i = (m >> 1) - je .L2_40 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K 
= K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL2x2_SUB(xxx) - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - vshufpd $0x01, %xmm14, %xmm14, %xmm15 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - vmovapd %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, %xmm15 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - 
-#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L2_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddpd 
(CO1), %xmm8 , %xmm8 - vaddpd (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $8 * SIZE, AO - - movq M, I - sarq $1, I // i = (m >> 1) - je .L1_40 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq 
BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL2x1_SUB(xxx) - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13,%xmm12 , %xmm12 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - -#else - vaddsubpd %xmm8, %xmm9 , %xmm9 - vaddsubpd %xmm12,%xmm13, %xmm13 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm13, %xmm1, %xmm13 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13, %xmm12, %xmm12 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L1_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp 
.L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8, %xmm8 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - - vmovapd %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - - vaddsubpd %xmm9 ,%xmm8, %xmm8 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfmaddpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfmaddpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfnmaddpd +#else +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfnmaddpd +#endif + + +#define A_PR1 384 +#define B_PR1 192 + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I 
%xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), 
%xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8 , %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define 
KERNEL1x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L2_40 + + ALIGN_4 + 
+.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL2x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + vshufpd $0x01, %xmm14, %xmm14, %xmm15 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + 
vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L2_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + 
vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L1_40 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 
SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL2x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd %xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L1_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef 
LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_2x2_piledriver.S b/kernel/x86_64/zgemm_kernel_2x2_piledriver.S index 848b6f237..bffe5439d 100644 --- a/kernel/x86_64/zgemm_kernel_2x2_piledriver.S +++ b/kernel/x86_64/zgemm_kernel_2x2_piledriver.S @@ -1,1429 +1,1429 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/********************************************************************* -* -* 2014/06/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* -* 2013/10/30 Saar -* -* Parameter: -* UNROLL_M 2 -* UNROLL_N 2 -* ZGEMM_P 384 -* ZGEMM_Q 168 -* A_PR1 512 -* B_PR1 256 -* -* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): -* -* 3456x3456 82.4 GFLOPS with 8 threads on 4 modules (ACML: 76.3 ) (BULLDOZER: 81.0 ) -* 3456x3456 79.9 GFLOPS with 4 threads on 4 modules (ACML: 69.9 ) (BULLDOZER: 74.6 ) -* 3456x3456 40.4 GFLOPS with 2 threads on 2 modules (ACML: 35.8 ) (BULLDOZER: 37.9 ) -* 3456x3456 20.3 GFLOPS with 1 threads on 1 modules (ACML: 18.1 ) (BULLDOZER: 19.2 ) -* -* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): -* -* 6912x6912 227.5 GFLOPS with 32 threads on 16 modules (ACML: 166.3 ) (BULLDOZER: 228.5 ) -* 6912x6912 211.6 GFLOPS with 16 threads on 16 modules (ACML: 169.5 ) (BULLDOZER: 204.3 ) -* 6912x6912 123.5 GFLOPS with 8 threads on 8 modules (ACML: 92.7 ) (BULLDOZER: 117.0 ) -* 3456x3456 64.1 GFLOPS with 4 threads on 4 modules (ACML: 49.1 ) (BULLDOZER: 61.7 ) -* 3456x3456 33.4 GFLOPS with 2 threads on 2 modules (ACML: 28.1 ) (BULLDOZER: 30.9 ) -* 3456x3456 17.0 GFLOPS with 1 threads on 1 modules (ACML: 15.2 ) (BULLDOZER: 15.7 ) -* -*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - 
-#define L_BUFFER_SIZE 256*8*4 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -#define VFMADD_R vfmaddpd -#define VFMADD_I vfmaddpd -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfmaddpd -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -#define VFMADD_R vfmaddpd -#define VFMADD_I vfnmaddpd -#else -#define VFMADD_R vfnmaddpd -#define VFMADD_I vfnmaddpd -#endif - - -#define A_PR1 512 -#define B_PR1 256 - -#define KERNEL2x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL2x2_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I 
%xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $16, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $8 , %rax ;\ - - -#define KERNEL1x2_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $2, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R 
%xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_2(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL2x1_4(xx) \ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $2, BI ;\ - addq $4, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_2(xx) \ - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_3(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_4(xx) \ - vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x1_SUB(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $2, %rax ;\ - - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI 
- movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA_R - vmovsd %xmm1, ALPHA_I - - salq $ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $8 * SIZE, AO - - movq M, I - sarq $1, I // i = (m >> 1) - je .L2_40 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - 
KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL2x2_SUB(xxx) - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - vshufpd $0x01, %xmm14, %xmm14, %xmm15 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - vmovapd %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - vshufpd $0x01, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, %xmm15 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L2_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq 
(,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - prefetcht0 B_PR1+64(BO,BI,SIZE) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm10, %xmm10, %xmm11 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - 
-.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $8 * SIZE, AO - - movq M, I - sarq $1, I // i = (m >> 1) - je .L1_40 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL2x1_SUB(xxx) - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13,%xmm12 , %xmm12 - - vshufpd $0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $0x01, %xmm12, %xmm12, %xmm13 - -#else - vaddsubpd %xmm8, %xmm9 , %xmm9 - vaddsubpd %xmm12,%xmm13, %xmm13 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - 
vshufpd $0x01, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm13, %xmm1, %xmm13 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13, %xmm12, %xmm12 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L1_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8, %xmm8 - - vshufpd $0x01, 
%xmm8 , %xmm8, %xmm9 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - - vmovapd %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufpd $0x01, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - - vaddsubpd %xmm9 ,%xmm8, %xmm8 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/********************************************************************* +* +* 2014/06/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/30 Saar +* +* Parameter: +* UNROLL_M 2 +* UNROLL_N 2 +* ZGEMM_P 384 +* ZGEMM_Q 168 +* A_PR1 512 +* B_PR1 256 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 3456x3456 82.4 GFLOPS with 8 threads on 4 modules (ACML: 76.3 ) (BULLDOZER: 81.0 ) +* 3456x3456 79.9 GFLOPS with 4 threads on 4 modules (ACML: 69.9 ) (BULLDOZER: 74.6 ) +* 3456x3456 40.4 GFLOPS with 2 threads on 2 modules (ACML: 35.8 ) (BULLDOZER: 37.9 ) +* 3456x3456 20.3 GFLOPS with 1 threads on 1 modules (ACML: 18.1 ) (BULLDOZER: 19.2 ) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 6912x6912 227.5 GFLOPS with 32 threads on 16 modules (ACML: 166.3 ) (BULLDOZER: 228.5 ) +* 6912x6912 211.6 GFLOPS with 16 threads on 16 modules (ACML: 169.5 ) (BULLDOZER: 204.3 ) +* 6912x6912 123.5 GFLOPS with 8 threads on 8 modules (ACML: 92.7 ) (BULLDOZER: 117.0 ) +* 3456x3456 64.1 GFLOPS with 4 threads on 4 modules (ACML: 49.1 ) (BULLDOZER: 61.7 ) +* 3456x3456 33.4 GFLOPS with 2 threads on 2 modules (ACML: 28.1 ) (BULLDOZER: 30.9 ) +* 3456x3456 17.0 GFLOPS with 1 threads on 1 modules (ACML: 15.2 ) (BULLDOZER: 15.7 ) +* +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 256*8*4 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfmaddpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfmaddpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddpd +#define VFMADD_I vfnmaddpd +#else +#define VFMADD_R vfnmaddpd +#define VFMADD_I vfnmaddpd +#endif + + +#define A_PR1 512 +#define B_PR1 256 + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), 
%xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 
-5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8 , %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -8 * 
SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), 
%xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L2_40 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL2x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + vshufpd $0x01, %xmm14, %xmm14, %xmm15 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + 
vmovapd %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + vshufpd $0x01, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L2_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + prefetcht0 B_PR1+64(BO,BI,SIZE) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax 
+#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm10, %xmm10, %xmm11 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $8 * SIZE, AO + + movq M, I + sarq $1, I // i = (m >> 1) + je .L1_40 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL2x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd %xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L1_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO 
// first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + vshufpd $0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufpd $0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff 
--git a/kernel/x86_64/zgemm_kernel_4x2_haswell.S b/kernel/x86_64/zgemm_kernel_4x2_haswell.S index f91bfa89b..29729b101 100644 --- a/kernel/x86_64/zgemm_kernel_4x2_haswell.S +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.S @@ -1,3881 +1,3881 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************************/ - -/******************************************************************************** -* 2014/07/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* 2013/10/28 Saar -* Parameter: -* ZGEMM_DEFAULT_UNROLL_N 2 -* ZGEMM_DEFAULT_UNROLL_M 4 -* ZGEMM_DEFAULT_P 256 -* ZGEMM_DEFAULT_Q 128 -* A_PR1 512 -* B_PR1 512 -* -* 2014/07/28 Saar -* Performance at 4608x4608x4608: -* 1 thread: 53 GFLOPS (SANDYBRIDGE: 29) (MKL: 53) -* 2 threads: 101 GFLOPS (SANDYBRIDGE: 59) (MKL: 100) -* 3 threads: 146 GFLOPS (SANDYBRIDGE: 86) (MKL: 138) -* 4 threads: 184 GFLOPS (SANDYBRIDGE: 108) (MKL: 172) -* -********************************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $ 0, 4096 * 4(%rsp);\ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $ 0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(BULLDOZER) - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - -#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 - -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) - -#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 - -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) - -#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 - -#else - -#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 - -#endif - -#else - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - -#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 - -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) - -#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 - -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) - -#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 - -#else - -#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd 
y1,y2,y0 - -#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 - -#endif - -#endif - -#define A_PR1 512 -#define B_PR1 512 - - - -/***************************************************************************************************/ - -.macro KERNEL4x3_SUB - vmovups (AO), %ymm0 - vmovups 4 * SIZE(AO), %ymm1 - prefetcht0 A_PR1(AO) - - vbroadcastsd (BO), %ymm2 - vbroadcastsd 1 * SIZE(BO), %ymm3 - VFMADDPD_R( %ymm8 ,%ymm2,%ymm0 ) - VFMADDPD_R( %ymm12,%ymm2,%ymm1 ) - VFMADDPD_I( %ymm9 ,%ymm3,%ymm0 ) - VFMADDPD_I( %ymm13,%ymm3,%ymm1 ) - - vbroadcastsd 2 * SIZE(BO), %ymm2 - vbroadcastsd 3 * SIZE(BO), %ymm3 - VFMADDPD_R( %ymm10,%ymm2,%ymm0 ) - VFMADDPD_R( %ymm14,%ymm2,%ymm1 ) - VFMADDPD_I( %ymm11,%ymm3,%ymm0 ) - VFMADDPD_I( %ymm15,%ymm3,%ymm1 ) - - vbroadcastsd 4 * SIZE(BO), %ymm2 - vbroadcastsd 5 * SIZE(BO), %ymm3 - VFMADDPD_R( %ymm4 ,%ymm2,%ymm0 ) - VFMADDPD_R( %ymm6 ,%ymm2,%ymm1 ) - VFMADDPD_I( %ymm5 ,%ymm3,%ymm0 ) - VFMADDPD_I( %ymm7 ,%ymm3,%ymm1 ) - - addq $ 6*SIZE, BO - addq $ 8*SIZE, AO - decq %rax -.endm - -.macro SAVE4x3 - - vbroadcastsd ALPHA_R, %ymm0 - vbroadcastsd ALPHA_I, %ymm1 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 - vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 - vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm11,%ymm10, %ymm10 - vaddsubpd %ymm13,%ymm12, %ymm12 - vaddsubpd %ymm15,%ymm14, %ymm14 - vaddsubpd %ymm5 ,%ymm4 , %ymm4 - vaddsubpd %ymm7 ,%ymm6 , %ymm6 - - vshufpd $ 0x05, %ymm8 , %ymm8 , %ymm9 - vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 - vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 - vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 - vshufpd $ 0x05, %ymm4 , %ymm4 , %ymm5 - vshufpd $ 0x05, %ymm6 , %ymm6 , %ymm7 - -#else - vaddsubpd %ymm8, %ymm9 ,%ymm9 - vaddsubpd %ymm10, %ymm11,%ymm11 - vaddsubpd %ymm12, %ymm13,%ymm13 - vaddsubpd %ymm14, %ymm15,%ymm15 - vaddsubpd %ymm4 , %ymm5 ,%ymm5 - vaddsubpd %ymm6 , %ymm7 ,%ymm7 - - vmovapd %ymm9, %ymm8 - vmovapd %ymm11, %ymm10 - vmovapd %ymm13, %ymm12 - vmovapd %ymm15, %ymm14 - vmovapd %ymm5 , %ymm4 - vmovapd %ymm7 , %ymm6 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 - vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 - vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 - -#endif - - // multiply with ALPHA_R - vmulpd %ymm8 , %ymm0, %ymm8 - vmulpd %ymm10, %ymm0, %ymm10 - vmulpd %ymm12, %ymm0, %ymm12 - vmulpd %ymm14, %ymm0, %ymm14 - vmulpd %ymm4 , %ymm0, %ymm4 - vmulpd %ymm6 , %ymm0, %ymm6 - - // multiply with ALPHA_I - vmulpd %ymm9 , %ymm1, %ymm9 - vmulpd %ymm11, %ymm1, %ymm11 - vmulpd %ymm13, %ymm1, %ymm13 - vmulpd %ymm15, %ymm1, %ymm15 - vmulpd %ymm5 , %ymm1, %ymm5 - vmulpd %ymm7 , %ymm1, %ymm7 - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm11,%ymm10, %ymm10 - vaddsubpd %ymm13,%ymm12, %ymm12 - vaddsubpd %ymm15,%ymm14, %ymm14 - vaddsubpd %ymm5 ,%ymm4 , %ymm4 - vaddsubpd %ymm7 ,%ymm6 , %ymm6 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %ymm8 , %ymm8 - vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 - - vaddpd (CO1, LDC), %ymm10, %ymm10 - vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 - - vaddpd (CO1, LDC,2), %ymm4 , %ymm4 - vaddpd 4 * SIZE(CO1, LDC,2), %ymm6 , %ymm6 -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 , 4 * SIZE(CO1) - 
- vmovups %ymm10 , (CO1, LDC) - vmovups %ymm14 , 4 * SIZE(CO1, LDC) - - vmovups %ymm4 , (CO1, LDC, 2) - vmovups %ymm6 , 4 * SIZE(CO1, LDC, 2) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - -.endm - - - -/***************************************************************************************************/ - -.macro KERNEL2x3_SUB - vmovups (AO), %xmm0 - vmovups 2 * SIZE(AO), %xmm1 - vmovddup (BO), %xmm2 - vmovddup 1 * SIZE(BO), %xmm3 - - VFMADDPD_R( %xmm8 ,%xmm2,%xmm0 ) - VFMADDPD_R( %xmm12,%xmm2,%xmm1 ) - VFMADDPD_I( %xmm9 ,%xmm3,%xmm0 ) - VFMADDPD_I( %xmm13,%xmm3,%xmm1 ) - - vmovddup 2 * SIZE(BO), %xmm2 - vmovddup 3 * SIZE(BO), %xmm3 - VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) - VFMADDPD_R( %xmm14,%xmm2,%xmm1 ) - VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) - VFMADDPD_I( %xmm15,%xmm3,%xmm1 ) - - vmovddup 4 * SIZE(BO), %xmm2 - vmovddup 5 * SIZE(BO), %xmm3 - VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) - VFMADDPD_R( %xmm6 ,%xmm2,%xmm1 ) - VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) - VFMADDPD_I( %xmm7 ,%xmm3,%xmm1 ) - - addq $ 6*SIZE, BO - addq $ 4*SIZE, AO - decq %rax -.endm - -.macro SAVE2x3 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 - vshufpd $ 0x01, %xmm5 , %xmm5 , %xmm5 - vshufpd $ 0x01, %xmm7 , %xmm7 , %xmm7 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - vaddsubpd %xmm5, %xmm4 , %xmm4 - vaddsubpd %xmm7, %xmm6 , %xmm6 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 - vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 - vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 - vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 - vshufpd $ 0x01, %xmm6 , %xmm6, %xmm7 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - vaddsubpd %xmm4, %xmm5 ,%xmm5 - vaddsubpd %xmm6, %xmm7 ,%xmm7 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - vmovapd %xmm15, %xmm14 - vmovapd %xmm5, %xmm4 - vmovapd %xmm7, %xmm6 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 - vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 - vshufpd $ 0x01, %xmm7 , %xmm7, %xmm7 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - vmulpd %xmm4 , %xmm0, %xmm4 - vmulpd %xmm6 , %xmm0, %xmm6 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, %xmm15 - vmulpd %xmm5 , %xmm1, %xmm5 - vmulpd %xmm7 , %xmm1, %xmm7 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - vaddsubpd %xmm5, %xmm4 , %xmm4 - vaddsubpd %xmm7, %xmm6 , %xmm6 - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - - vaddpd (CO1, LDC,2), %xmm4 , %xmm4 - vaddpd 2 * SIZE(CO1, LDC,2), %xmm6 , %xmm6 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * 
SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) - - vmovups %xmm4 , (CO1, LDC,2) - vmovups %xmm6 , 2 * SIZE(CO1, LDC,2) - -.endm - - -/************************************************************************************************/ - - -.macro KERNEL1x3_SUB - vmovups (AO), %xmm0 - vmovddup (BO), %xmm2 - vmovddup 1 * SIZE(BO), %xmm3 - - VFMADDPD_R( %xmm8,%xmm2,%xmm0 ) - VFMADDPD_I( %xmm9,%xmm3,%xmm0 ) - - vmovddup 2 * SIZE(BO), %xmm2 - vmovddup 3 * SIZE(BO), %xmm3 - VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) - VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) - - vmovddup 4 * SIZE(BO), %xmm2 - vmovddup 5 * SIZE(BO), %xmm3 - VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) - VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) - - addq $ 6*SIZE, BO - addq $ 2*SIZE, AO - decq %rax -.endm - -.macro SAVE1x3 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm5, %xmm4 , %xmm4 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 - vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - vaddsubpd %xmm4, %xmm5, %xmm5 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm5, %xmm4 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm4 , %xmm0, %xmm4 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm5 , %xmm1, %xmm5 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm5, %xmm4 , %xmm4 - -#ifndef TRMMKERNEL - - vaddpd (CO1) , %xmm8 , %xmm8 - vaddpd (CO1, LDC) , %xmm10, %xmm10 - vaddpd (CO1, LDC,2) , %xmm4 , %xmm4 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm4 , (CO1, LDC,2) - -.endm - - - - -/***************************************************************************************************/ - -.macro KERNEL4x2_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 - - vbroadcastsd -8 * SIZE(BO, BI, SIZE), %ymm4 - vbroadcastsd -7 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) - VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) - VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPD_R( %ymm10,%ymm6,%ymm0 ) - VFMADDPD_R( %ymm14,%ymm6,%ymm1 ) - VFMADDPD_I( %ymm11,%ymm7,%ymm0 ) - VFMADDPD_I( %ymm15,%ymm7,%ymm1 ) - - addq $ 4, BI - addq $ 8, %rax -.endm - -.macro SAVE4x2 - - vbroadcastsd ALPHA_R, %ymm0 - vbroadcastsd ALPHA_I, %ymm1 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm11,%ymm10, %ymm10 - vaddsubpd %ymm13,%ymm12, %ymm12 - vaddsubpd %ymm15,%ymm14, %ymm14 - - vshufpd $ 0x05, 
%ymm8 , %ymm8, %ymm9 - vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 - vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 - vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 - -#else - vaddsubpd %ymm8, %ymm9 ,%ymm9 - vaddsubpd %ymm10, %ymm11,%ymm11 - vaddsubpd %ymm12, %ymm13,%ymm13 - vaddsubpd %ymm14, %ymm15,%ymm15 - - vmovapd %ymm9, %ymm8 - vmovapd %ymm11, %ymm10 - vmovapd %ymm13, %ymm12 - vmovapd %ymm15, %ymm14 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 - -#endif - - // multiply with ALPHA_R - vmulpd %ymm8 , %ymm0, %ymm8 - vmulpd %ymm10, %ymm0, %ymm10 - vmulpd %ymm12, %ymm0, %ymm12 - vmulpd %ymm14, %ymm0, %ymm14 - - // multiply with ALPHA_I - vmulpd %ymm9 , %ymm1, %ymm9 - vmulpd %ymm11, %ymm1, %ymm11 - vmulpd %ymm13, %ymm1, %ymm13 - vmulpd %ymm15, %ymm1, %ymm15 - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm11,%ymm10, %ymm10 - vaddsubpd %ymm13,%ymm12, %ymm12 - vaddsubpd %ymm15,%ymm14, %ymm14 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %ymm8 , %ymm8 - vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 - - vaddpd (CO1, LDC), %ymm10, %ymm10 - vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 - -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 , 4 * SIZE(CO1) - - vmovups %ymm10 , (CO1, LDC) - vmovups %ymm14 , 4 * SIZE(CO1, LDC) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - -.endm - -/***************************************************************************************************/ - -.macro KERNEL2x2_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) - VFMADDPD_R( %xmm14,%xmm6,%xmm1 ) - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) - VFMADDPD_I( %xmm15,%xmm7,%xmm1 ) - addq $ 4, BI - addq $ 4, %rax -.endm - -.macro SAVE2x2 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 - vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 - vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 - -#else - vaddsubpd %xmm8, %xmm9 ,%xmm9 - vaddsubpd %xmm10, %xmm11,%xmm11 - vaddsubpd %xmm12, %xmm13,%xmm13 - vaddsubpd %xmm14, %xmm15,%xmm15 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - vmovapd %xmm13, %xmm12 - vmovapd %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - vmulpd %xmm12, %xmm0, %xmm12 - vmulpd %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - vmulpd %xmm13, %xmm1, %xmm13 - vmulpd %xmm15, %xmm1, 
%xmm15 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - vaddsubpd %xmm13,%xmm12, %xmm12 - vaddsubpd %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - - vaddpd (CO1, LDC), %xmm10, %xmm10 - vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 2 * SIZE(CO1, LDC) - -.endm - -/************************************************************************************************/ - -/************************************************************************************************/ - - -.macro KERNEL1x2_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 - vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 - vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) - VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) - addq $ 4, BI - addq $ 2, %rax -.endm - -.macro SAVE1x2 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - vaddsubpd %xmm10,%xmm11, %xmm11 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm11, %xmm1, %xmm11 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm11,%xmm10, %xmm10 - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm10 , (CO1, LDC) - -.endm - - -/************************************************************************************************/ - -.macro KERNEL4x1_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastsd -4 * SIZE(BO, BI, SIZE) , %ymm4 - vbroadcastsd -3 * SIZE(BO, BI, SIZE) , %ymm5 - VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) - VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) - VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) - VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) - - addq $ 2, BI - addq $ 8, %rax -.endm - -.macro SAVE4x1 - - vbroadcastsd ALPHA_R, %ymm0 - vbroadcastsd ALPHA_I, %ymm1 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm13,%ymm12 , %ymm12 - - vshufpd $ 0x05, %ymm8 , %ymm8, %ymm9 - vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 - -#else - vaddsubpd %ymm8, %ymm9 , %ymm9 - vaddsubpd %ymm12,%ymm13, %ymm13 - - vmovapd %ymm9, %ymm8 - vmovapd %ymm13, %ymm12 - - // swap high and low 8 bytes - vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 - vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 - -#endif - - // multiply with ALPHA_R - vmulpd %ymm8 , %ymm0, %ymm8 - vmulpd %ymm12, %ymm0, %ymm12 - - // multiply with ALPHA_I - 
vmulpd %ymm9 , %ymm1, %ymm9 - vmulpd %ymm13, %ymm1, %ymm13 - - vaddsubpd %ymm9, %ymm8 , %ymm8 - vaddsubpd %ymm13, %ymm12, %ymm12 - - - -#ifndef TRMMKERNEL - - vaddpd (CO1), %ymm8 , %ymm8 - vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 - -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 ,4 * SIZE(CO1) - -.endm - - - -/************************************************************************************************/ - -.macro KERNEL2x1_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) - addq $ 2, BI - addq $ 4, %rax -.endm - -.macro SAVE2x1 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13,%xmm12 , %xmm12 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 - -#else - vaddsubpd %xmm8, %xmm9 , %xmm9 - vaddsubpd %xmm12,%xmm13, %xmm13 - - vmovapd %xmm9, %xmm8 - vmovapd %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - vmulpd %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - vmulpd %xmm13, %xmm1, %xmm13 - - vaddsubpd %xmm9, %xmm8 , %xmm8 - vaddsubpd %xmm13, %xmm12, %xmm12 - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 2 * SIZE(CO1) - -.endm - - -/************************************************************************************************/ - -.macro KERNEL1x1_SUB - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 - vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) - vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) - addq $ 2, BI - addq $ 2, %rax -.endm - -.macro SAVE1x1 - - vmovddup ALPHA_R, %xmm0 - vmovddup ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubpd %xmm9, %xmm8, %xmm8 - - vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubpd %xmm8, %xmm9, %xmm9 - - vmovapd %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulpd %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulpd %xmm9 , %xmm1, %xmm9 - - vaddsubpd %xmm9 ,%xmm8, %xmm8 - -#ifndef TRMMKERNEL - - vaddpd (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - -.endm - - -/************************************************************************************************/ - - - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $ STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 
128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $ 128 + L_BUFFER_SIZE, %rsp - andq $ -4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA_R - vmovsd %xmm1, ALPHA_I - - salq $ ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $ 6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - -/************************************************************************************************/ -.L6_00_0: - - movq Ndiv6, J - cmpq $ 0, J - je .L2_00_0 - ALIGN_4 - - - -.L6_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - salq $2, %rax // 2 * COMPSIZE - leaq (B, %rax,8), BO2 - movq BO2, B // next offset of B - movq K, %rax - ALIGN_4 - -.L6_00_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups (BO2), %xmm2 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - vmovups %xmm2, 4 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 6*SIZE,BO - decq %rax - jnz .L6_00_02b - -.L6_00_02c: - - - -.L6_00_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - leaq (C, LDC, 1), C // c += 1 * ldc - - movq A, AO // aoffset = a - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L6_2_10 - - ALIGN_4 - -/******************************************************************************************************************/ - -.L6_4_11: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L6_4_16 - ALIGN_4 - -.L6_4_12: - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - je .L6_4_16 - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - je .L6_4_16 - - jmp .L6_4_12 - ALIGN_4 - -.L6_4_16: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L6_4_19 - ALIGN_4 - -.L6_4_17: - - KERNEL4x3_SUB - - jnz .L6_4_17 - ALIGN_4 - - -.L6_4_19: - - SAVE4x3 - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L6_4_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ - - -/******************************************************************************************************************/ -.L6_2_10: - testq $ 2, M - jz .L6_2_40 // to next 2 lines of N - -.L6_2_11: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L6_2_16 - ALIGN_4 - -.L6_2_12: - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - je .L6_2_16 - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - je .L6_2_16 - - jmp .L6_2_12 - 
ALIGN_4 - -.L6_2_16: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L6_2_19 - ALIGN_4 - -.L6_2_17: - - KERNEL2x3_SUB - - jnz .L6_2_17 - ALIGN_4 - - -.L6_2_19: - - SAVE2x3 - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_2_40: - testq $ 1, M - jz .L6_2_60 // to next 2 lines of N - - ALIGN_4 - -.L6_2_41: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L6_2_46 - - ALIGN_4 - -.L6_2_42: - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - je .L6_2_46 - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - je .L6_2_46 - - jmp .L6_2_42 - ALIGN_4 - -.L6_2_46: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L6_2_49 - - ALIGN_4 - -.L6_2_47: - - KERNEL1x3_SUB - - jnz .L6_2_47 - ALIGN_4 - - -.L6_2_49: - - SAVE1x3 - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L6_2_41 - ALIGN_4 - - - - -.L6_2_60: - - -/************************************************************************************************/ - -/************************************************************************************************/ - - -.L7_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - salq $2, %rax // 2 * COMPSIZE - leaq (B, %rax,8), BO2 - movq K, %rax - ALIGN_4 - -.L7_00_02b: - - vmovups 2 * SIZE(BO1), %xmm0 - vmovups (BO2), %xmm1 - vmovups 2 * SIZE(BO2), %xmm2 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - vmovups %xmm2, 4 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 6*SIZE,BO - decq %rax - jnz .L7_00_02b - -.L7_00_02c: - - movq BO2, B // next offset of B - - -.L7_00_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - leaq (C, LDC, 1), C // c += 1 * ldc - - movq A, AO // aoffset = a - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L7_2_10 - - ALIGN_4 - -/******************************************************************************************************************/ - -.L7_4_11: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L7_4_16 - ALIGN_4 - -.L7_4_12: - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - je .L7_4_16 - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - KERNEL4x3_SUB - - je .L7_4_16 - - jmp .L7_4_12 - ALIGN_4 - -.L7_4_16: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L7_4_19 - - ALIGN_4 - -.L7_4_17: - - KERNEL4x3_SUB - - jnz .L7_4_17 - ALIGN_4 - - -.L7_4_19: - - SAVE4x3 - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L7_4_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ - - -/******************************************************************************************************************/ -.L7_2_10: - testq $ 2, M - jz .L7_2_40 // to next 2 lines of N - -.L7_2_11: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L7_2_16 - ALIGN_4 - -.L7_2_12: - - 
KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - je .L7_2_16 - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - KERNEL2x3_SUB - - je .L7_2_16 - - jmp .L7_2_12 - ALIGN_4 - -.L7_2_16: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L7_2_19 - - ALIGN_4 - -.L7_2_17: - - KERNEL2x3_SUB - - jnz .L7_2_17 - ALIGN_4 - - -.L7_2_19: - - SAVE2x3 - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L7_2_40: - testq $ 1, M - jz .L7_2_60 // to next 2 lines of N - - ALIGN_4 - -.L7_2_41: - - leaq BUFFER1, BO // first buffer to BO - - vzeroall - - movq K, %rax - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L7_2_46 - - ALIGN_4 - -.L7_2_42: - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - je .L7_2_46 - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - KERNEL1x3_SUB - - je .L7_2_46 - - jmp .L7_2_42 - ALIGN_4 - -.L7_2_46: - movq K, %rax - - andq $ 7, %rax # if (k & 1) - je .L7_2_49 - ALIGN_4 - -.L7_2_47: - - KERNEL1x3_SUB - - jnz .L7_2_47 - ALIGN_4 - - -.L7_2_49: - - SAVE1x3 - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L7_2_41 - ALIGN_4 - - - - -.L7_2_60: - - decq J // j -- - jg .L6_00_01 // next 6 lines of N - -/************************************************************************************************/ - - - -/************************************************************************************************/ -.L2_00_0: - - movq Nmod6, J - sarq $1, J // j = j / 2 - cmpq $ 0, J - je .L1_2_0 - ALIGN_4 - - - -.L2_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_00_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L2_00_02b - -.L2_00_02c: - - movq BO1, B // next offset of B - - -.L2_00_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 8 * SIZE, AO - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L2_2_10 - - ALIGN_4 - -/******************************************************************************************************************/ - -.L2_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = 
K - ( K % 8 ) - je .L2_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - je .L2_4_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - je .L2_4_16 - - jmp .L2_4_12 - ALIGN_4 - -.L2_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_17: - - KERNEL4x2_SUB - - jl .L2_4_17 - ALIGN_4 - - -.L2_4_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_4_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ - - -/******************************************************************************************************************/ -.L2_2_10: - testq $ 2, M - jz .L2_2_40 // to next 2 lines of N - -.L2_2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - 
- andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_2_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_2_16 - - jmp .L2_2_12 - ALIGN_4 - -.L2_2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_17: - - KERNEL2x2_SUB - - jl .L2_2_17 - ALIGN_4 - - -.L2_2_19: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_2_40: - testq $ 1, M - jz .L2_2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - 
KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_2_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_2_46 - - jmp .L2_2_42 - ALIGN_4 - -.L2_2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_47: - - KERNEL1x2_SUB - - jl .L2_2_47 - ALIGN_4 - - -.L2_2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L2_2_41 - ALIGN_4 - - - - -.L2_2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $ 2, KK -#endif - - decq J // j -- - jg .L2_00_01 // next 2 lines of N - - - -.L1_2_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $ 1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_00_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO - decq %rax - jnz .L1_00_02b - -.L1_00_02c: - - movq BO1, B // next offset of B - -.L1_00_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 8 * SIZE, AO - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L1_2_10 - - ALIGN_4 - -/*******************************************************************************************************/ - - -.L1_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, 
KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_12: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - jmp .L1_4_12 - ALIGN_4 - -.L1_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_17: - - KERNEL4x1_SUB - - jl .L1_4_17 - ALIGN_4 - - -.L1_4_19: - - SAVE4x1 - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_4_11 - ALIGN_4 - - - - -/*******************************************************************************************************/ -.L1_2_10: - testq $ 2, M - jz .L1_2_40 - - -.L1_2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_2_16 - - jmp .L1_2_12 - ALIGN_4 - -.L1_2_16: -#ifndef TRMMKERNEL - 
movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_17: - - KERNEL2x1_SUB - - jl .L1_2_17 - ALIGN_4 - - -.L1_2_19: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_2_40: - testq $ 1, M - jz .L999 - - ALIGN_4 - -.L1_2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_2_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_2_46 - - jmp .L1_2_42 - ALIGN_4 - -.L1_2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_47: - - KERNEL1x1_SUB - - jl .L1_2_47 - ALIGN_4 - - -.L1_2_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, 
BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L1_2_41 - ALIGN_4 - - - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $ STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************************ - TRMM Kernel -************************************************************************************************/ - - PROLOGUE - PROFCODE - - subq $ STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $ 128 + L_BUFFER_SIZE, %rsp - andq $ -4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA_R - vmovsd %xmm1, ALPHA_I - - salq $ ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $ 2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_00_0: - - movq Ndiv6, J - cmpq $ 0, J - je .L1_2_0 - ALIGN_4 - - - -.L2_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_00_02b: - - vmovups (BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm1 - vmovups %xmm0, (BO) - vmovups %xmm1, 2 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L2_00_02b - -.L2_00_02c: - - movq BO1, B // next offset of B - - -.L2_00_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 8 * SIZE, AO - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L2_2_10 - - ALIGN_4 - -/******************************************************************************************************************/ - -.L2_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - 
addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - je .L2_4_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI ,SIZE) - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - - je .L2_4_16 - - jmp .L2_4_12 - ALIGN_4 - -.L2_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_17: - - KERNEL4x2_SUB - - jl .L2_4_17 - ALIGN_4 - - -.L2_4_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_4_11 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ - - -/******************************************************************************************************************/ -.L2_2_10: - testq $ 2, M - jz .L2_2_40 // to next 2 lines of N - -.L2_2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO 
// first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_2_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_2_16 - - jmp .L2_2_12 - ALIGN_4 - -.L2_2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_17: - - KERNEL2x2_SUB - - jl .L2_2_17 - ALIGN_4 - - -.L2_2_19: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_2_40: - testq $ 1, M - jz .L2_2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_2_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_2_46 - - jmp .L2_2_42 - ALIGN_4 - -.L2_2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_2_47: - - KERNEL1x2_SUB - - jl .L2_2_47 - ALIGN_4 - - -.L2_2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L2_2_41 - ALIGN_4 - - - - -.L2_2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $ 2, KK -#endif - - decq J // j -- - jg .L2_00_01 // next 2 lines of N - - - -.L1_2_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $ 1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_00_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_00_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO - decq %rax - jnz .L1_00_02b - -.L1_00_02c: - - movq BO1, B // next offset of B - -.L1_00_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 8 * SIZE, AO - - movq M, I - sarq $ 2, I // i = (m >> 2) - je .L1_2_10 - - ALIGN_4 - -/*******************************************************************************************************/ - - -.L1_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq 
BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_12: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - jmp .L1_4_12 - ALIGN_4 - -.L1_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_17: - - KERNEL4x1_SUB - - jl .L1_4_17 - ALIGN_4 - - -.L1_4_19: - - SAVE4x1 - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_4_11 - ALIGN_4 - - - - -/*******************************************************************************************************/ -.L1_2_10: - testq $ 2, M - jz .L1_2_40 - - -.L1_2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax 
- ALIGN_4 - -.L1_2_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_2_16 - - jmp .L1_2_12 - ALIGN_4 - -.L1_2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_17: - - KERNEL2x1_SUB - - jl .L1_2_17 - ALIGN_4 - - -.L1_2_19: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - - ALIGN_4 - - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_2_40: - testq $ 1, M - jz .L999 - - ALIGN_4 - -.L1_2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_2_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_2_46 - - jmp .L1_2_42 - ALIGN_4 - -.L1_2_46: -#ifndef TRMMKERNEL 
- movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_2_47: - - KERNEL1x1_SUB - - jl .L1_2_47 - ALIGN_4 - - -.L1_2_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L1_2_41 - ALIGN_4 - - - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $ STACKSIZE, %rsp - ret - - EPILOGUE - -#endif - - +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +/******************************************************************************** +* 2014/07/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* 2013/10/28 Saar +* Parameter: +* ZGEMM_DEFAULT_UNROLL_N 2 +* ZGEMM_DEFAULT_UNROLL_M 4 +* ZGEMM_DEFAULT_P 256 +* ZGEMM_DEFAULT_Q 128 +* A_PR1 512 +* B_PR1 512 +* +* 2014/07/28 Saar +* Performance at 4608x4608x4608: +* 1 thread: 53 GFLOPS (SANDYBRIDGE: 29) (MKL: 53) +* 2 threads: 101 GFLOPS (SANDYBRIDGE: 59) (MKL: 100) +* 3 threads: 146 GFLOPS (SANDYBRIDGE: 86) (MKL: 138) +* 4 threads: 184 GFLOPS (SANDYBRIDGE: 108) (MKL: 172) +* +********************************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $ 0, 4096 * 4(%rsp);\ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $ 0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(BULLDOZER) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + +#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + +#define VFMADDPD_R( y0,y1,y2 ) vfmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#else + +#define VFMADDPD_R( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmaddpd y0,y1,y2,y0 + +#endif + +#else + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + +#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + +#define VFMADDPD_R( y0,y1,y2 ) vfmadd231pd y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 + +#else + +#define VFMADDPD_R( y0,y1,y2 ) vfnmadd231pd 
y1,y2,y0 + +#define VFMADDPD_I( y0,y1,y2 ) vfnmadd231pd y1,y2,y0 + +#endif + +#endif + +#define A_PR1 512 +#define B_PR1 512 + + + +/***************************************************************************************************/ + +.macro KERNEL4x3_SUB + vmovups (AO), %ymm0 + vmovups 4 * SIZE(AO), %ymm1 + prefetcht0 A_PR1(AO) + + vbroadcastsd (BO), %ymm2 + vbroadcastsd 1 * SIZE(BO), %ymm3 + VFMADDPD_R( %ymm8 ,%ymm2,%ymm0 ) + VFMADDPD_R( %ymm12,%ymm2,%ymm1 ) + VFMADDPD_I( %ymm9 ,%ymm3,%ymm0 ) + VFMADDPD_I( %ymm13,%ymm3,%ymm1 ) + + vbroadcastsd 2 * SIZE(BO), %ymm2 + vbroadcastsd 3 * SIZE(BO), %ymm3 + VFMADDPD_R( %ymm10,%ymm2,%ymm0 ) + VFMADDPD_R( %ymm14,%ymm2,%ymm1 ) + VFMADDPD_I( %ymm11,%ymm3,%ymm0 ) + VFMADDPD_I( %ymm15,%ymm3,%ymm1 ) + + vbroadcastsd 4 * SIZE(BO), %ymm2 + vbroadcastsd 5 * SIZE(BO), %ymm3 + VFMADDPD_R( %ymm4 ,%ymm2,%ymm0 ) + VFMADDPD_R( %ymm6 ,%ymm2,%ymm1 ) + VFMADDPD_I( %ymm5 ,%ymm3,%ymm0 ) + VFMADDPD_I( %ymm7 ,%ymm3,%ymm1 ) + + addq $ 6*SIZE, BO + addq $ 8*SIZE, AO + decq %rax +.endm + +.macro SAVE4x3 + + vbroadcastsd ALPHA_R, %ymm0 + vbroadcastsd ALPHA_I, %ymm1 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 + vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + vaddsubpd %ymm5 ,%ymm4 , %ymm4 + vaddsubpd %ymm7 ,%ymm6 , %ymm6 + + vshufpd $ 0x05, %ymm8 , %ymm8 , %ymm9 + vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 + vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 + vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 + vshufpd $ 0x05, %ymm4 , %ymm4 , %ymm5 + vshufpd $ 0x05, %ymm6 , %ymm6 , %ymm7 + +#else + vaddsubpd %ymm8, %ymm9 ,%ymm9 + vaddsubpd %ymm10, %ymm11,%ymm11 + vaddsubpd %ymm12, %ymm13,%ymm13 + vaddsubpd %ymm14, %ymm15,%ymm15 + vaddsubpd %ymm4 , %ymm5 ,%ymm5 + vaddsubpd %ymm6 , %ymm7 ,%ymm7 + + vmovapd %ymm9, %ymm8 + vmovapd %ymm11, %ymm10 + vmovapd %ymm13, %ymm12 + vmovapd %ymm15, %ymm14 + vmovapd %ymm5 , %ymm4 + vmovapd %ymm7 , %ymm6 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + vshufpd $ 0x05, %ymm5 , %ymm5 , %ymm5 + vshufpd $ 0x05, %ymm7 , %ymm7 , %ymm7 + +#endif + + // multiply with ALPHA_R + vmulpd %ymm8 , %ymm0, %ymm8 + vmulpd %ymm10, %ymm0, %ymm10 + vmulpd %ymm12, %ymm0, %ymm12 + vmulpd %ymm14, %ymm0, %ymm14 + vmulpd %ymm4 , %ymm0, %ymm4 + vmulpd %ymm6 , %ymm0, %ymm6 + + // multiply with ALPHA_I + vmulpd %ymm9 , %ymm1, %ymm9 + vmulpd %ymm11, %ymm1, %ymm11 + vmulpd %ymm13, %ymm1, %ymm13 + vmulpd %ymm15, %ymm1, %ymm15 + vmulpd %ymm5 , %ymm1, %ymm5 + vmulpd %ymm7 , %ymm1, %ymm7 + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + vaddsubpd %ymm5 ,%ymm4 , %ymm4 + vaddsubpd %ymm7 ,%ymm6 , %ymm6 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + + vaddpd (CO1, LDC), %ymm10, %ymm10 + vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 + + vaddpd (CO1, LDC,2), %ymm4 , %ymm4 + vaddpd 4 * SIZE(CO1, LDC,2), %ymm6 , %ymm6 +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 4 * SIZE(CO1) + 
+ vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 4 * SIZE(CO1, LDC) + + vmovups %ymm4 , (CO1, LDC, 2) + vmovups %ymm6 , 4 * SIZE(CO1, LDC, 2) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + +.endm + + + +/***************************************************************************************************/ + +.macro KERNEL2x3_SUB + vmovups (AO), %xmm0 + vmovups 2 * SIZE(AO), %xmm1 + vmovddup (BO), %xmm2 + vmovddup 1 * SIZE(BO), %xmm3 + + VFMADDPD_R( %xmm8 ,%xmm2,%xmm0 ) + VFMADDPD_R( %xmm12,%xmm2,%xmm1 ) + VFMADDPD_I( %xmm9 ,%xmm3,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm3,%xmm1 ) + + vmovddup 2 * SIZE(BO), %xmm2 + vmovddup 3 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) + VFMADDPD_R( %xmm14,%xmm2,%xmm1 ) + VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) + VFMADDPD_I( %xmm15,%xmm3,%xmm1 ) + + vmovddup 4 * SIZE(BO), %xmm2 + vmovddup 5 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) + VFMADDPD_R( %xmm6 ,%xmm2,%xmm1 ) + VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) + VFMADDPD_I( %xmm7 ,%xmm3,%xmm1 ) + + addq $ 6*SIZE, BO + addq $ 4*SIZE, AO + decq %rax +.endm + +.macro SAVE2x3 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + vshufpd $ 0x01, %xmm5 , %xmm5 , %xmm5 + vshufpd $ 0x01, %xmm7 , %xmm7 , %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + vaddsubpd %xmm5, %xmm4 , %xmm4 + vaddsubpd %xmm7, %xmm6 , %xmm6 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 + vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 + vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 + vshufpd $ 0x01, %xmm6 , %xmm6, %xmm7 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + vaddsubpd %xmm4, %xmm5 ,%xmm5 + vaddsubpd %xmm6, %xmm7 ,%xmm7 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + vmovapd %xmm5, %xmm4 + vmovapd %xmm7, %xmm6 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 + vshufpd $ 0x01, %xmm7 , %xmm7, %xmm7 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + vmulpd %xmm4 , %xmm0, %xmm4 + vmulpd %xmm6 , %xmm0, %xmm6 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, %xmm15 + vmulpd %xmm5 , %xmm1, %xmm5 + vmulpd %xmm7 , %xmm1, %xmm7 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + vaddsubpd %xmm5, %xmm4 , %xmm4 + vaddsubpd %xmm7, %xmm6 , %xmm6 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + + vaddpd (CO1, LDC,2), %xmm4 , %xmm4 + vaddpd 2 * SIZE(CO1, LDC,2), %xmm6 , %xmm6 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * 
SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + + vmovups %xmm4 , (CO1, LDC,2) + vmovups %xmm6 , 2 * SIZE(CO1, LDC,2) + +.endm + + +/************************************************************************************************/ + + +.macro KERNEL1x3_SUB + vmovups (AO), %xmm0 + vmovddup (BO), %xmm2 + vmovddup 1 * SIZE(BO), %xmm3 + + VFMADDPD_R( %xmm8,%xmm2,%xmm0 ) + VFMADDPD_I( %xmm9,%xmm3,%xmm0 ) + + vmovddup 2 * SIZE(BO), %xmm2 + vmovddup 3 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm10,%xmm2,%xmm0 ) + VFMADDPD_I( %xmm11,%xmm3,%xmm0 ) + + vmovddup 4 * SIZE(BO), %xmm2 + vmovddup 5 * SIZE(BO), %xmm3 + VFMADDPD_R( %xmm4 ,%xmm2,%xmm0 ) + VFMADDPD_I( %xmm5 ,%xmm3,%xmm0 ) + + addq $ 6*SIZE, BO + addq $ 2*SIZE, AO + decq %rax +.endm + +.macro SAVE1x3 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm5, %xmm4 , %xmm4 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + vshufpd $ 0x01, %xmm4 , %xmm4, %xmm5 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + vaddsubpd %xmm4, %xmm5, %xmm5 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm5, %xmm4 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm5 , %xmm5, %xmm5 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm4 , %xmm0, %xmm4 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm5 , %xmm1, %xmm5 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm5, %xmm4 , %xmm4 + +#ifndef TRMMKERNEL + + vaddpd (CO1) , %xmm8 , %xmm8 + vaddpd (CO1, LDC) , %xmm10, %xmm10 + vaddpd (CO1, LDC,2) , %xmm4 , %xmm4 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm4 , (CO1, LDC,2) + +.endm + + + + +/***************************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 + + vbroadcastsd -8 * SIZE(BO, BI, SIZE), %ymm4 + vbroadcastsd -7 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) + VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) + VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPD_R( %ymm10,%ymm6,%ymm0 ) + VFMADDPD_R( %ymm14,%ymm6,%ymm1 ) + VFMADDPD_I( %ymm11,%ymm7,%ymm0 ) + VFMADDPD_I( %ymm15,%ymm7,%ymm1 ) + + addq $ 4, BI + addq $ 8, %rax +.endm + +.macro SAVE4x2 + + vbroadcastsd ALPHA_R, %ymm0 + vbroadcastsd ALPHA_I, %ymm1 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + + vshufpd $ 0x05, 
%ymm8 , %ymm8, %ymm9 + vshufpd $ 0x05, %ymm10, %ymm10, %ymm11 + vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 + vshufpd $ 0x05, %ymm14, %ymm14, %ymm15 + +#else + vaddsubpd %ymm8, %ymm9 ,%ymm9 + vaddsubpd %ymm10, %ymm11,%ymm11 + vaddsubpd %ymm12, %ymm13,%ymm13 + vaddsubpd %ymm14, %ymm15,%ymm15 + + vmovapd %ymm9, %ymm8 + vmovapd %ymm11, %ymm10 + vmovapd %ymm13, %ymm12 + vmovapd %ymm15, %ymm14 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm11, %ymm11, %ymm11 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + vshufpd $ 0x05, %ymm15, %ymm15, %ymm15 + +#endif + + // multiply with ALPHA_R + vmulpd %ymm8 , %ymm0, %ymm8 + vmulpd %ymm10, %ymm0, %ymm10 + vmulpd %ymm12, %ymm0, %ymm12 + vmulpd %ymm14, %ymm0, %ymm14 + + // multiply with ALPHA_I + vmulpd %ymm9 , %ymm1, %ymm9 + vmulpd %ymm11, %ymm1, %ymm11 + vmulpd %ymm13, %ymm1, %ymm13 + vmulpd %ymm15, %ymm1, %ymm15 + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm11,%ymm10, %ymm10 + vaddsubpd %ymm13,%ymm12, %ymm12 + vaddsubpd %ymm15,%ymm14, %ymm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + + vaddpd (CO1, LDC), %ymm10, %ymm10 + vaddpd 4 * SIZE(CO1, LDC), %ymm14, %ymm14 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 4 * SIZE(CO1) + + vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 4 * SIZE(CO1, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + +.endm + +/***************************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) + VFMADDPD_R( %xmm14,%xmm6,%xmm1 ) + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) + VFMADDPD_I( %xmm15,%xmm7,%xmm1 ) + addq $ 4, BI + addq $ 4, %rax +.endm + +.macro SAVE2x2 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 + vshufpd $ 0x01, %xmm14, %xmm14, %xmm15 + +#else + vaddsubpd %xmm8, %xmm9 ,%xmm9 + vaddsubpd %xmm10, %xmm11,%xmm11 + vaddsubpd %xmm12, %xmm13,%xmm13 + vaddsubpd %xmm14, %xmm15,%xmm15 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + vmovapd %xmm13, %xmm12 + vmovapd %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + vshufpd $ 0x01, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + vmulpd %xmm12, %xmm0, %xmm12 + vmulpd %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + vmulpd %xmm13, %xmm1, %xmm13 + vmulpd %xmm15, %xmm1, 
%xmm15 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + vaddsubpd %xmm13,%xmm12, %xmm12 + vaddsubpd %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + + vaddpd (CO1, LDC), %xmm10, %xmm10 + vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 2 * SIZE(CO1, LDC) + +.endm + +/************************************************************************************************/ + +/************************************************************************************************/ + + +.macro KERNEL1x2_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 + vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 + vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPD_R( %xmm10,%xmm6,%xmm0 ) + VFMADDPD_I( %xmm11,%xmm7,%xmm0 ) + addq $ 4, BI + addq $ 2, %rax +.endm + +.macro SAVE1x2 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm10, %xmm10, %xmm11 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + vaddsubpd %xmm10,%xmm11, %xmm11 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm11, %xmm1, %xmm11 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm11,%xmm10, %xmm10 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm10 , (CO1, LDC) + +.endm + + +/************************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastsd -4 * SIZE(BO, BI, SIZE) , %ymm4 + vbroadcastsd -3 * SIZE(BO, BI, SIZE) , %ymm5 + VFMADDPD_R( %ymm8 ,%ymm4,%ymm0 ) + VFMADDPD_R( %ymm12,%ymm4,%ymm1 ) + VFMADDPD_I( %ymm9 ,%ymm5,%ymm0 ) + VFMADDPD_I( %ymm13,%ymm5,%ymm1 ) + + addq $ 2, BI + addq $ 8, %rax +.endm + +.macro SAVE4x1 + + vbroadcastsd ALPHA_R, %ymm0 + vbroadcastsd ALPHA_I, %ymm1 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm13,%ymm12 , %ymm12 + + vshufpd $ 0x05, %ymm8 , %ymm8, %ymm9 + vshufpd $ 0x05, %ymm12, %ymm12, %ymm13 + +#else + vaddsubpd %ymm8, %ymm9 , %ymm9 + vaddsubpd %ymm12,%ymm13, %ymm13 + + vmovapd %ymm9, %ymm8 + vmovapd %ymm13, %ymm12 + + // swap high and low 8 bytes + vshufpd $ 0x05, %ymm9 , %ymm9, %ymm9 + vshufpd $ 0x05, %ymm13, %ymm13, %ymm13 + +#endif + + // multiply with ALPHA_R + vmulpd %ymm8 , %ymm0, %ymm8 + vmulpd %ymm12, %ymm0, %ymm12 + + // multiply with ALPHA_I + 
vmulpd %ymm9 , %ymm1, %ymm9 + vmulpd %ymm13, %ymm1, %ymm13 + + vaddsubpd %ymm9, %ymm8 , %ymm8 + vaddsubpd %ymm13, %ymm12, %ymm12 + + + +#ifndef TRMMKERNEL + + vaddpd (CO1), %ymm8 , %ymm8 + vaddpd 4 * SIZE(CO1), %ymm12, %ymm12 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 ,4 * SIZE(CO1) + +.endm + + + +/************************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPD_R( %xmm12,%xmm4,%xmm1 ) + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPD_I( %xmm13,%xmm5,%xmm1 ) + addq $ 2, BI + addq $ 4, %rax +.endm + +.macro SAVE2x1 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13,%xmm12 , %xmm12 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + vshufpd $ 0x01, %xmm12, %xmm12, %xmm13 + +#else + vaddsubpd %xmm8, %xmm9 , %xmm9 + vaddsubpd %xmm12,%xmm13, %xmm13 + + vmovapd %xmm9, %xmm8 + vmovapd %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + vshufpd $ 0x01, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + vmulpd %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + vmulpd %xmm13, %xmm1, %xmm13 + + vaddsubpd %xmm9, %xmm8 , %xmm8 + vaddsubpd %xmm13, %xmm12, %xmm12 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 2 * SIZE(CO1) + +.endm + + +/************************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 + vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPD_R( %xmm8,%xmm4,%xmm0 ) + vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPD_I( %xmm9,%xmm5,%xmm0 ) + addq $ 2, BI + addq $ 2, %rax +.endm + +.macro SAVE1x1 + + vmovddup ALPHA_R, %xmm0 + vmovddup ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubpd %xmm9, %xmm8, %xmm8 + + vshufpd $ 0x01, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubpd %xmm8, %xmm9, %xmm9 + + vmovapd %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufpd $ 0x01, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulpd %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulpd %xmm9 , %xmm1, %xmm9 + + vaddsubpd %xmm9 ,%xmm8, %xmm8 + +#ifndef TRMMKERNEL + + vaddpd (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +.endm + + +/************************************************************************************************/ + + + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $ STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 
128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $ 128 + L_BUFFER_SIZE, %rsp + andq $ -4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $ 6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +/************************************************************************************************/ +.L6_00_0: + + movq Ndiv6, J + cmpq $ 0, J + je .L2_00_0 + ALIGN_4 + + + +.L6_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 2 * COMPSIZE + leaq (B, %rax,8), BO2 + movq BO2, B // next offset of B + movq K, %rax + ALIGN_4 + +.L6_00_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups (BO2), %xmm2 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + vmovups %xmm2, 4 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L6_00_02b + +.L6_00_02c: + + + +.L6_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L6_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L6_4_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_4_16 + ALIGN_4 + +.L6_4_12: + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L6_4_16 + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L6_4_16 + + jmp .L6_4_12 + ALIGN_4 + +.L6_4_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_4_19 + ALIGN_4 + +.L6_4_17: + + KERNEL4x3_SUB + + jnz .L6_4_17 + ALIGN_4 + + +.L6_4_19: + + SAVE4x3 + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L6_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L6_2_10: + testq $ 2, M + jz .L6_2_40 // to next 2 lines of N + +.L6_2_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_2_16 + ALIGN_4 + +.L6_2_12: + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L6_2_16 + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L6_2_16 + + jmp .L6_2_12 + 
ALIGN_4 + +.L6_2_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_2_19 + ALIGN_4 + +.L6_2_17: + + KERNEL2x3_SUB + + jnz .L6_2_17 + ALIGN_4 + + +.L6_2_19: + + SAVE2x3 + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_2_40: + testq $ 1, M + jz .L6_2_60 // to next 2 lines of N + + ALIGN_4 + +.L6_2_41: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L6_2_46 + + ALIGN_4 + +.L6_2_42: + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L6_2_46 + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L6_2_46 + + jmp .L6_2_42 + ALIGN_4 + +.L6_2_46: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L6_2_49 + + ALIGN_4 + +.L6_2_47: + + KERNEL1x3_SUB + + jnz .L6_2_47 + ALIGN_4 + + +.L6_2_49: + + SAVE1x3 + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L6_2_41 + ALIGN_4 + + + + +.L6_2_60: + + +/************************************************************************************************/ + +/************************************************************************************************/ + + +.L7_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + salq $2, %rax // 2 * COMPSIZE + leaq (B, %rax,8), BO2 + movq K, %rax + ALIGN_4 + +.L7_00_02b: + + vmovups 2 * SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + vmovups %xmm2, 4 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 6*SIZE,BO + decq %rax + jnz .L7_00_02b + +.L7_00_02c: + + movq BO2, B // next offset of B + + +.L7_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L7_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L7_4_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_4_16 + ALIGN_4 + +.L7_4_12: + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L7_4_16 + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + KERNEL4x3_SUB + + je .L7_4_16 + + jmp .L7_4_12 + ALIGN_4 + +.L7_4_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_4_19 + + ALIGN_4 + +.L7_4_17: + + KERNEL4x3_SUB + + jnz .L7_4_17 + ALIGN_4 + + +.L7_4_19: + + SAVE4x3 + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L7_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L7_2_10: + testq $ 2, M + jz .L7_2_40 // to next 2 lines of N + +.L7_2_11: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_2_16 + ALIGN_4 + +.L7_2_12: + + 
KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L7_2_16 + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + KERNEL2x3_SUB + + je .L7_2_16 + + jmp .L7_2_12 + ALIGN_4 + +.L7_2_16: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_2_19 + + ALIGN_4 + +.L7_2_17: + + KERNEL2x3_SUB + + jnz .L7_2_17 + ALIGN_4 + + +.L7_2_19: + + SAVE2x3 + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_2_40: + testq $ 1, M + jz .L7_2_60 // to next 2 lines of N + + ALIGN_4 + +.L7_2_41: + + leaq BUFFER1, BO // first buffer to BO + + vzeroall + + movq K, %rax + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L7_2_46 + + ALIGN_4 + +.L7_2_42: + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L7_2_46 + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + KERNEL1x3_SUB + + je .L7_2_46 + + jmp .L7_2_42 + ALIGN_4 + +.L7_2_46: + movq K, %rax + + andq $ 7, %rax # if (k & 1) + je .L7_2_49 + ALIGN_4 + +.L7_2_47: + + KERNEL1x3_SUB + + jnz .L7_2_47 + ALIGN_4 + + +.L7_2_49: + + SAVE1x3 + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L7_2_41 + ALIGN_4 + + + + +.L7_2_60: + + decq J // j -- + jg .L6_00_01 // next 6 lines of N + +/************************************************************************************************/ + + + +/************************************************************************************************/ +.L2_00_0: + + movq Nmod6, J + sarq $1, J // j = j / 2 + cmpq $ 0, J + je .L1_2_0 + ALIGN_4 + + + +.L2_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_00_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L2_00_02b + +.L2_00_02c: + + movq BO1, B // next offset of B + + +.L2_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L2_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = 
K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L2_2_10: + testq $ 2, M + jz .L2_2_40 // to next 2 lines of N + +.L2_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + 
+ andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + jmp .L2_2_12 + ALIGN_4 + +.L2_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_17: + + KERNEL2x2_SUB + + jl .L2_2_17 + ALIGN_4 + + +.L2_2_19: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_2_40: + testq $ 1, M + jz .L2_2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + 
KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + jmp .L2_2_42 + ALIGN_4 + +.L2_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_47: + + KERNEL1x2_SUB + + jl .L2_2_47 + ALIGN_4 + + +.L2_2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L2_2_41 + ALIGN_4 + + + + +.L2_2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $ 2, KK +#endif + + decq J // j -- + jg .L2_00_01 // next 2 lines of N + + + +.L1_2_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $ 1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_00_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO + decq %rax + jnz .L1_00_02b + +.L1_00_02c: + + movq BO1, B // next offset of B + +.L1_00_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L1_2_10 + + ALIGN_4 + +/*******************************************************************************************************/ + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, 
KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_4_11 + ALIGN_4 + + + + +/*******************************************************************************************************/ +.L1_2_10: + testq $ 2, M + jz .L1_2_40 + + +.L1_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + jmp .L1_2_12 + ALIGN_4 + +.L1_2_16: +#ifndef TRMMKERNEL + 
movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_17: + + KERNEL2x1_SUB + + jl .L1_2_17 + ALIGN_4 + + +.L1_2_19: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_2_40: + testq $ 1, M + jz .L999 + + ALIGN_4 + +.L1_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + jmp .L1_2_42 + ALIGN_4 + +.L1_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_47: + + KERNEL1x1_SUB + + jl .L1_2_47 + ALIGN_4 + + +.L1_2_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, 
BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L1_2_41 + ALIGN_4 + + + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $ STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************************ + TRMM Kernel +************************************************************************************************/ + + PROLOGUE + PROFCODE + + subq $ STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $ 128 + L_BUFFER_SIZE, %rsp + andq $ -4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA_R + vmovsd %xmm1, ALPHA_I + + salq $ ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $ 2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_00_0: + + movq Ndiv6, J + cmpq $ 0, J + je .L1_2_0 + ALIGN_4 + + + +.L2_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_00_02b: + + vmovups (BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm1 + vmovups %xmm0, (BO) + vmovups %xmm1, 2 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L2_00_02b + +.L2_00_02c: + + movq BO1, B // next offset of B + + +.L2_00_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L2_2_10 + + ALIGN_4 + +/******************************************************************************************************************/ + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + 
addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI ,SIZE) + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_4_11 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ + + +/******************************************************************************************************************/ +.L2_2_10: + testq $ 2, M + jz .L2_2_40 // to next 2 lines of N + +.L2_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO 
// first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_2_16 + + jmp .L2_2_12 + ALIGN_4 + +.L2_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_17: + + KERNEL2x2_SUB + + jl .L2_2_17 + ALIGN_4 + + +.L2_2_19: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_2_40: + testq $ 1, M + jz .L2_2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_2_46 + + jmp .L2_2_42 + ALIGN_4 + +.L2_2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_2_47: + + KERNEL1x2_SUB + + jl .L2_2_47 + ALIGN_4 + + +.L2_2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L2_2_41 + ALIGN_4 + + + + +.L2_2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $ 2, KK +#endif + + decq J // j -- + jg .L2_00_01 // next 2 lines of N + + + +.L1_2_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $ 1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_00_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_00_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO + decq %rax + jnz .L1_00_02b + +.L1_00_02c: + + movq BO1, B // next offset of B + +.L1_00_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 8 * SIZE, AO + + movq M, I + sarq $ 2, I // i = (m >> 2) + je .L1_2_10 + + ALIGN_4 + +/*******************************************************************************************************/ + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq 
BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_4_11 + ALIGN_4 + + + + +/*******************************************************************************************************/ +.L1_2_10: + testq $ 2, M + jz .L1_2_40 + + +.L1_2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax 
+ ALIGN_4 + +.L1_2_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_2_16 + + jmp .L1_2_12 + ALIGN_4 + +.L1_2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_17: + + KERNEL2x1_SUB + + jl .L1_2_17 + ALIGN_4 + + +.L1_2_19: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + + ALIGN_4 + + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_2_40: + testq $ 1, M + jz .L999 + + ALIGN_4 + +.L1_2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_2_46 + + jmp .L1_2_42 + ALIGN_4 + +.L1_2_46: +#ifndef TRMMKERNEL 
+ movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_2_47: + + KERNEL1x1_SUB + + jl .L1_2_47 + ALIGN_4 + + +.L1_2_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L1_2_41 + ALIGN_4 + + + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $ STACKSIZE, %rsp + ret + + EPILOGUE + +#endif + + diff --git a/relapack/src/CMakeLists.txt b/relapack/src/CMakeLists.txt index 78fb1431f..b92089418 100644 --- a/relapack/src/CMakeLists.txt +++ b/relapack/src/CMakeLists.txt @@ -1,86 +1,86 @@ -include_directories(${PROJECT_SOURCE_DIR}) -include_directories(${PROJECT_BINARY_DIR}) -include_directories(${PROJECT_SOURCE_DIR}/relapack) - -set(RELAFILES -clauum.c -ctrsyl_rec2.c -dsytrf.c -spbtrf.c -strsyl_rec2.c -zhetrf_rook_rec2.c -ztrsyl.c -cgbtrf.c -cpbtrf.c -ctrtri.c -dsytrf_rec2.c -spotrf.c -strtri.c -zlauum.c -ztrsyl_rec2.c -cgemmt.c -cpotrf.c -dgbtrf.c -dsytrf_rook.c -lapack_wrappers.c -ssygst.c -zgbtrf.c -zpbtrf.c -ztrtri.c -cgetrf.c -csytrf.c -dgemmt.c -dsytrf_rook_rec2.c -ssytrf.c -zgemmt.c -zpotrf.c -chegst.c -csytrf_rec2.c -dgetrf.c -dtgsyl.c -ssytrf_rec2.c -zgetrf.c -zsytrf.c -chetrf.c -csytrf_rook.c -dlauum.c -dtrsyl.c -sgbtrf.c -ssytrf_rook.c -zhegst.c -zsytrf_rec2.c -chetrf_rec2.c -csytrf_rook_rec2.c -dpbtrf.c -dtrsyl_rec2.c -sgemmt.c -ssytrf_rook_rec2.c -zhetrf.c -zsytrf_rook.c -chetrf_rook.c -ctgsyl.c -dpotrf.c -dtrtri.c -sgetrf.c -stgsyl.c -zhetrf_rec2.c -zsytrf_rook_rec2.c -chetrf_rook_rec2.c -ctrsyl.c -dsygst.c -f2c.c -slauum.c -strsyl.c -zhetrf_rook.c -ztgsyl.c -) - - - -# add relapack folder to the sources -set(RELA_SOURCES "") -foreach (RELA_FILE ${RELAFILES}) - list(APPEND RELA_SOURCES "${PROJECT_SOURCE_DIR}/relapack/src/${RELA_FILE}") -endforeach () -add_library(relapack_src OBJECT ${RELA_SOURCES}) -set_source_files_properties(${RELA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") +include_directories(${PROJECT_SOURCE_DIR}) +include_directories(${PROJECT_BINARY_DIR}) +include_directories(${PROJECT_SOURCE_DIR}/relapack) + +set(RELAFILES +clauum.c +ctrsyl_rec2.c +dsytrf.c +spbtrf.c +strsyl_rec2.c +zhetrf_rook_rec2.c +ztrsyl.c +cgbtrf.c +cpbtrf.c +ctrtri.c +dsytrf_rec2.c +spotrf.c +strtri.c +zlauum.c +ztrsyl_rec2.c +cgemmt.c +cpotrf.c +dgbtrf.c +dsytrf_rook.c +lapack_wrappers.c +ssygst.c +zgbtrf.c +zpbtrf.c +ztrtri.c +cgetrf.c +csytrf.c +dgemmt.c +dsytrf_rook_rec2.c +ssytrf.c +zgemmt.c 
+zpotrf.c +chegst.c +csytrf_rec2.c +dgetrf.c +dtgsyl.c +ssytrf_rec2.c +zgetrf.c +zsytrf.c +chetrf.c +csytrf_rook.c +dlauum.c +dtrsyl.c +sgbtrf.c +ssytrf_rook.c +zhegst.c +zsytrf_rec2.c +chetrf_rec2.c +csytrf_rook_rec2.c +dpbtrf.c +dtrsyl_rec2.c +sgemmt.c +ssytrf_rook_rec2.c +zhetrf.c +zsytrf_rook.c +chetrf_rook.c +ctgsyl.c +dpotrf.c +dtrtri.c +sgetrf.c +stgsyl.c +zhetrf_rec2.c +zsytrf_rook_rec2.c +chetrf_rook_rec2.c +ctrsyl.c +dsygst.c +f2c.c +slauum.c +strsyl.c +zhetrf_rook.c +ztgsyl.c +) + + + +# add relapack folder to the sources +set(RELA_SOURCES "") +foreach (RELA_FILE ${RELAFILES}) + list(APPEND RELA_SOURCES "${PROJECT_SOURCE_DIR}/relapack/src/${RELA_FILE}") +endforeach () +add_library(relapack_src OBJECT ${RELA_SOURCES}) +set_source_files_properties(${RELA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") From f73cfb7e2cf5a2a225bda6642c13c83b7bf9df29 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Nov 2022 09:39:56 +0100 Subject: [PATCH 101/154] change line endings from CRLF to LF --- kernel/x86_64/cgemm_kernel_4x2_bulldozer.S | 3794 +++--- kernel/x86_64/cgemm_kernel_4x2_piledriver.S | 3842 +++--- kernel/x86_64/cgemm_kernel_8x2_sandy.S | 4706 ++++---- kernel/x86_64/dgemm_kernel_16x2_haswell.S | 10430 ++++++++-------- kernel/x86_64/dgemm_kernel_4x4_haswell.S | 6988 +++++------ kernel/x86_64/dgemm_kernel_4x8_haswell.S | 10306 ++++++++-------- kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c | 1340 +-- kernel/x86_64/dgemm_kernel_8x2_bulldozer.S | 8826 +++++++------- kernel/x86_64/dgemm_kernel_8x2_piledriver.S | 9046 +++++++------- kernel/x86_64/sgemm_kernel_16x2_bulldozer.S | 10462 ++++++++-------- kernel/x86_64/sgemm_kernel_16x2_piledriver.S | 10516 ++++++++--------- kernel/x86_64/sgemm_kernel_16x4_sandy.S | 6334 +++++----- kernel/x86_64/strsm_kernel_8x4_haswell_RN.c | 558 +- kernel/x86_64/strsm_kernel_8x4_haswell_RT.c | 562 +- 14 files changed, 43855 insertions(+), 43855 deletions(-) diff --git a/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S b/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S index 97958a88f..2675f71fb 100644 --- a/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S +++ b/kernel/x86_64/cgemm_kernel_4x2_bulldozer.S @@ -1,1897 +1,1897 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -#define VFMADD_R vfmaddps -#define VFMADD_I vfmaddps -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -#define VFMADD_R vfnmaddps -#define VFMADD_I vfmaddps -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -#define VFMADD_R vfmaddps -#define VFMADD_I vfnmaddps -#else -#define VFMADD_R vfnmaddps -#define VFMADD_I vfnmaddps -#endif - - - -#define A_PR1 384 -#define B_PR1 192 - -#define KERNEL4x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I 
%xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL4x2_2(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL4x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL4x2_4(xx) \ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $16, BI ;\ - addq $32, %rax ;\ - - -#define KERNEL4x2_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $4, BI ;\ - addq $8, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL2x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL2x2_2(xx) \ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I 
%xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL2x2_3(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL2x2_4(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x2_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_2(xx) \ - vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_3(xx) \ - vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_4(xx) \ - vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x2_SUB(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), 
%xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $2, %rax ;\ - - - -/************************************************************************************************/ - -#define KERNEL4x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL4x1_2(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL4x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL4x1_4(xx) \ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $8, BI ;\ - addq $32, %rax ;\ - - -#define KERNEL4x1_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $2, BI ;\ - addq $8, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL2x1_2(xx) \ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL2x1_3(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL2x1_4(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 3 * SIZE(BO, 
BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x1_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $4, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_2(xx) \ - vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_3(xx) \ - vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_4(xx) \ - vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x1_SUB(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $2, %rax ;\ - - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA_R - vmovss %xmm1, ALPHA_I - - salq $ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - 
-.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = (m >> 2) - je .L2_20 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB(xxx) - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - vshufps $0xb1, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 - vshufps $0xb1, %xmm12, %xmm12, %xmm13 - vshufps $0xb1, %xmm14, %xmm14, %xmm15 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - vaddsubps %xmm12, %xmm13,%xmm13 - vaddsubps %xmm14, %xmm15,%xmm15 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - vmovaps %xmm13, %xmm12 - vmovaps %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - vshufps $0xb1, %xmm13, 
%xmm13, %xmm13 - vshufps $0xb1, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - vmulps %xmm12, %xmm0, %xmm12 - vmulps %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - vmulps %xmm13, %xmm1, %xmm13 - vmulps %xmm15, %xmm1, %xmm15 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - - vaddps (CO1, LDC), %xmm10, %xmm10 - vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 4 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_11 - ALIGN_4 - - - -/************************************************************************** -* Rest of M -***************************************************************************/ - -.L2_20: - testq $3, M - jz .L2_60 // to next 2 lines of N - - testq $2, M - jz .L2_40 - ALIGN_4 - -.L2_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_26 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, 
SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL2x2_SUB(xxx) - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - - vaddps (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - - vmovups %xmm10 , (CO1, LDC) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - - -/**************************************************************************/ -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - 
movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - - vmovsd (CO1, LDC), %xmm15 - vaddps %xmm15, %xmm10, %xmm10 - -#endif - - vmovsd %xmm8 , (CO1) - - vmovsd %xmm10 , (CO1, LDC) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = (m >> 2) - je .L1_20 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq 
K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB(xxx) - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm12, %xmm12, %xmm13 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm12, %xmm13,%xmm13 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm13, %xmm1, %xmm13 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_11 - ALIGN_4 - - - -/************************************************************************** -* Rest of M -***************************************************************************/ - -.L1_20: - testq $3, M - jz .L999 - - testq $2, M - jz .L1_40 - ALIGN_4 - -.L1_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq 
KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_26 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL2x1_SUB(xxx) - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - - -/**************************************************************************/ -.L1_40: - testq $1, M - jz .L999 // to next 2 lines of N - - ALIGN_4 - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq 
$1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - - - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - -#endif - - vmovsd %xmm8 , (CO1) - - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddps +#define VFMADD_I vfmaddps +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddps +#define VFMADD_I 
vfmaddps +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddps +#define VFMADD_I vfnmaddps +#else +#define VFMADD_R vfnmaddps +#define VFMADD_I vfnmaddps +#endif + + + +#define A_PR1 384 +#define B_PR1 192 + +#define KERNEL4x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_2(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_4(xx) \ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $32, %rax ;\ + + +#define KERNEL4x2_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ 
+ +/************************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_3(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + 
vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + + + +/************************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_2(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_4(xx) \ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + + +#define KERNEL4x1_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $8, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * 
SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_3(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 
8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA_R + vmovss %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = (m >> 2) + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + 
vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + vshufps $0xb1, %xmm14, %xmm14, %xmm15 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + vaddsubps %xmm12, %xmm13,%xmm13 + vaddsubps %xmm14, %xmm15,%xmm15 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + vmovaps %xmm13, %xmm12 + vmovaps %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + vmulps %xmm12, %xmm0, %xmm12 + vmulps %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + vmulps %xmm13, %xmm1, %xmm13 + vmulps %xmm15, %xmm1, %xmm15 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + + vaddps (CO1, LDC), %xmm10, %xmm10 + vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 4 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L2_20: + testq $3, M + jz .L2_60 // to next 2 lines of N + + testq $2, M + jz .L2_40 + ALIGN_4 + +.L2_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_26 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO 
+ negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL2x2_SUB(xxx) + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK 
+#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + + vmovsd (CO1, LDC), %xmm15 + vaddps %xmm15, %xmm10, %xmm10 + +#endif + + vmovsd %xmm8 , (CO1) + + vmovsd %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next 
offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = (m >> 2) + je .L1_20 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm12, %xmm13,%xmm13 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm13, %xmm1, %xmm13 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax 
+ movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L1_20: + testq $3, M + jz .L999 + + testq $2, M + jz .L1_40 + ALIGN_4 + +.L1_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_26 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL2x1_SUB(xxx) + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; 
number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L1_40: + testq $1, M + jz .L999 // to next 2 lines of N + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + + + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + +#endif + + vmovsd %xmm8 , (CO1) + + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && 
defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/cgemm_kernel_4x2_piledriver.S b/kernel/x86_64/cgemm_kernel_4x2_piledriver.S index 72deee12f..bf7f91ee9 100644 --- a/kernel/x86_64/cgemm_kernel_4x2_piledriver.S +++ b/kernel/x86_64/cgemm_kernel_4x2_piledriver.S @@ -1,1921 +1,1921 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ -/********************************************************************* -* -* 2014/06/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* -* 2013/10/31 Saar -* -* Parameter: -* UNROLL_M 4 -* UNROLL_N 2 -* CGEMM_P 768 -* CGEMM_Q 168 -* A_PR1 512 -* B_PR1 256 -* -* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): -* -* 4608x4608 154.0 GFLOPS with 8 threads on 4 modules (ACML: 111.7 ) (BULLDOZER: 153.9 ) -* 4608x4608 148.3 GFLOPS with 4 threads on 4 modules (ACML: 96.0 ) (BULLDOZER: 143.2 ) -* 3456x3456 74.3 GFLOPS with 2 threads on 2 modules (ACML: 47.3 ) (BULLDOZER: 72.3 ) -* 3456x3456 37.3 GFLOPS with 1 threads on 1 modules (ACML: 24.2 ) (BULLDOZER: 36.5 ) -* -* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): -* -* 6912x6912 421.5 GFLOPS with 32 threads on 16 modules (ACML: 266.6 ) (BULLDOZER: 422.5 ) -* 6912x6912 407.0 GFLOPS with 16 threads on 16 modules (ACML: 271.5 ) (BULLDOZER: 404.7 ) -* 6912x6912 234.2 GFLOPS with 8 threads on 8 modules (ACML: 164.0 ) (BULLDOZER: 230.5 ) -* 4608x4608 123.1 GFLOPS with 4 threads on 4 modules (ACML: 87.9 ) (BULLDOZER: 120.9 ) -* 3456x3456 62.6 GFLOPS with 2 threads on 2 modules (ACML: 44.5 ) (BULLDOZER: 62.1 ) -* 3456x3456 31.8 GFLOPS with 1 threads on 1 modules (ACML: 22.6 ) (BULLDOZER: 31.4 ) -* -*********************************************************************/ - - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 256*8*4 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) -#define VFMADD_R vfmaddps -#define VFMADD_I vfmaddps -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -#define VFMADD_R vfnmaddps -#define VFMADD_I vfmaddps -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -#define VFMADD_R vfmaddps -#define VFMADD_I vfnmaddps -#else -#define VFMADD_R vfnmaddps -#define VFMADD_I vfnmaddps -#endif - - - -#define A_PR1 512 -#define B_PR1 256 - -#define KERNEL4x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * 
SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL4x2_2(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL4x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - -#define KERNEL4x2_4(xx) \ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $16, BI ;\ - addq $32, %rax ;\ - - -#define KERNEL4x2_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ - addq $4, BI ;\ - addq $8, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL2x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I 
%xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL2x2_2(xx) \ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL2x2_3(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL2x2_4(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x2_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -/************************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_2(xx) \ - vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_3(xx) \ - vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - -#define KERNEL1x2_4(xx) \ - vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 4 * SIZE(BO, BI, 
SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $16, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x2_SUB(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ - VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ - VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ - addq $4, BI ;\ - addq $2, %rax ;\ - - - -/************************************************************************************************/ - -#define KERNEL4x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL4x1_2(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL4x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - -#define KERNEL4x1_4(xx) \ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $8, BI ;\ - addq $32, %rax ;\ - - -#define KERNEL4x1_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ - VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ - addq $2, BI ;\ - addq $8, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL2x1_2(xx) \ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - 
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL2x1_3(xx) \ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL2x1_4(xx) \ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - - -#define KERNEL2x1_SUB(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $4, %rax ;\ - - -/************************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_2(xx) \ - vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_3(xx) \ - vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - -#define KERNEL1x1_4(xx) \ - vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - - -#define KERNEL1x1_SUB(xx) \ - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ - VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ - VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ - addq $2, BI ;\ - addq $2, %rax ;\ - - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA_R - 
vmovss %xmm1, ALPHA_I - - salq $ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $4*SIZE,BO1 - addq $4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = (m >> 2) - je .L2_20 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB(xxx) - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - vshufps $0xb1, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, 
%xmm11 - vshufps $0xb1, %xmm12, %xmm12, %xmm13 - vshufps $0xb1, %xmm14, %xmm14, %xmm15 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - vaddsubps %xmm12, %xmm13,%xmm13 - vaddsubps %xmm14, %xmm15,%xmm15 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - vmovaps %xmm13, %xmm12 - vmovaps %xmm15, %xmm14 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - vshufps $0xb1, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - vmulps %xmm12, %xmm0, %xmm12 - vmulps %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - vmulps %xmm13, %xmm1, %xmm13 - vmulps %xmm15, %xmm1, %xmm15 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - - vaddps (CO1, LDC), %xmm10, %xmm10 - vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 4 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_11 - ALIGN_4 - - - -/************************************************************************** -* Rest of M -***************************************************************************/ - -.L2_20: - testq $3, M - jz .L2_60 // to next 2 lines of N - - testq $2, M - jz .L2_40 - ALIGN_4 - -.L2_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_26 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - 
KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL2x2_SUB(xxx) - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - - vaddps (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - - vmovups %xmm10 , (CO1, LDC) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - - -/**************************************************************************/ -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - 
KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - - vmovsd (CO1, LDC), %xmm15 - vaddps %xmm15, %xmm10, %xmm10 - -#endif - - vmovsd %xmm8 , (CO1) - - vmovsd %xmm10 , (CO1, LDC) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = (m >> 2) - je .L1_20 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB(xxx) - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $0xb1, %xmm12, %xmm12, %xmm13 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm12, %xmm13,%xmm13 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm13, %xmm12 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $0xb1, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm13, %xmm1, %xmm13 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_11 - ALIGN_4 - - - 
-/************************************************************************** -* Rest of M -***************************************************************************/ - -.L1_20: - testq $3, M - jz .L999 - - testq $2, M - jz .L1_40 - ALIGN_4 - -.L1_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_26 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL2x1_SUB(xxx) - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - - -/**************************************************************************/ -.L1_40: - testq 
$1, M - jz .L999 // to next 2 lines of N - - ALIGN_4 - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - - - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - -#endif - - vmovsd %xmm8 , (CO1) - - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 
64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +/********************************************************************* +* +* 2014/06/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/31 Saar +* +* Parameter: +* UNROLL_M 4 +* UNROLL_N 2 +* CGEMM_P 768 +* CGEMM_Q 168 +* A_PR1 512 +* B_PR1 256 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 4608x4608 154.0 GFLOPS with 8 threads on 4 modules (ACML: 111.7 ) (BULLDOZER: 153.9 ) +* 4608x4608 148.3 GFLOPS with 4 threads on 4 modules (ACML: 96.0 ) (BULLDOZER: 143.2 ) +* 3456x3456 74.3 GFLOPS with 2 threads on 2 modules (ACML: 47.3 ) (BULLDOZER: 72.3 ) +* 3456x3456 37.3 GFLOPS with 1 threads on 1 modules (ACML: 24.2 ) (BULLDOZER: 36.5 ) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 6912x6912 421.5 GFLOPS with 32 threads on 16 modules (ACML: 266.6 ) (BULLDOZER: 422.5 ) +* 6912x6912 407.0 GFLOPS with 16 threads on 16 modules (ACML: 271.5 ) (BULLDOZER: 404.7 ) +* 6912x6912 234.2 GFLOPS with 8 threads on 8 modules (ACML: 164.0 ) (BULLDOZER: 230.5 ) +* 4608x4608 123.1 GFLOPS with 4 threads on 4 modules (ACML: 87.9 ) (BULLDOZER: 120.9 ) +* 3456x3456 62.6 GFLOPS with 2 threads on 2 modules (ACML: 44.5 ) (BULLDOZER: 62.1 ) +* 3456x3456 31.8 GFLOPS with 1 threads on 1 modules (ACML: 22.6 ) (BULLDOZER: 31.4 ) +* +*********************************************************************/ + + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 256*8*4 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define VFMADD_R vfmaddps +#define VFMADD_I vfmaddps +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define VFMADD_R vfnmaddps +#define VFMADD_I vfmaddps +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define VFMADD_R vfmaddps +#define VFMADD_I vfnmaddps +#else +#define VFMADD_R vfnmaddps +#define VFMADD_I vfnmaddps +#endif + + + +#define A_PR1 512 +#define B_PR1 256 + +#define KERNEL4x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * 
SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_2(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + +#define KERNEL4x2_4(xx) \ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $16, BI ;\ + addq $32, %rax ;\ + + +#define KERNEL4x2_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I 
%xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_2(xx) \ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_3(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL2x2_4(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x2_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +/************************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_2(xx) \ + vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_3(xx) \ + vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + +#define KERNEL1x2_4(xx) \ + vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 4 * SIZE(BO, BI, 
SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $16, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x2_SUB(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ + VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ + VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ + addq $4, BI ;\ + addq $2, %rax ;\ + + + +/************************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_2(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + +#define KERNEL4x1_4(xx) \ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + + +#define KERNEL4x1_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 ;\ + VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ + addq $2, BI ;\ + addq $8, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_2(xx) \ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + 
VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_3(xx) \ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL2x1_4(xx) \ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + + +#define KERNEL2x1_SUB(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $4, %rax ;\ + + +/************************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_2(xx) \ + vmovsd -14 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_3(xx) \ + vmovsd -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + +#define KERNEL1x1_4(xx) \ + vmovsd -10 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + + +#define KERNEL1x1_SUB(xx) \ + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ + VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ + VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ + addq $2, BI ;\ + addq $2, %rax ;\ + + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA_R + 
vmovss %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $4*SIZE,BO1 + addq $4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = (m >> 2) + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB(xxx) + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, 
%xmm11 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + vshufps $0xb1, %xmm14, %xmm14, %xmm15 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + vaddsubps %xmm12, %xmm13,%xmm13 + vaddsubps %xmm14, %xmm15,%xmm15 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + vmovaps %xmm13, %xmm12 + vmovaps %xmm15, %xmm14 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + vshufps $0xb1, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + vmulps %xmm12, %xmm0, %xmm12 + vmulps %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + vmulps %xmm13, %xmm1, %xmm13 + vmulps %xmm15, %xmm1, %xmm15 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + + vaddps (CO1, LDC), %xmm10, %xmm10 + vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 4 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L2_20: + testq $3, M + jz .L2_60 // to next 2 lines of N + + testq $2, M + jz .L2_40 + ALIGN_4 + +.L2_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_26 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + 
KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL2x2_SUB(xxx) + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + 
KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + + vmovsd (CO1, LDC), %xmm15 + vaddps %xmm15, %xmm10, %xmm10 + +#endif + + vmovsd %xmm8 , (CO1) + + vmovsd %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = (m >> 2) + je .L1_20 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB(xxx) + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $0xb1, %xmm12, %xmm12, %xmm13 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm12, %xmm13,%xmm13 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm13, %xmm12 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $0xb1, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm13, %xmm1, %xmm13 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + + + 
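+// Sketch of the complex update that the SAVE code ending at .L1_19 above
+// appears to perform for the non-conjugated (NN/NT/TN/TT) case; the scalar
+// names below (acc8, acc9, sum_r, sum_i, c_r, c_i) are illustrative only and
+// are not defined anywhere in this file.  The $0xb1 vshufps swaps the real
+// and imaginary 32-bit halves of each 64-bit complex element, and vaddsubps
+// subtracts in the even lanes while adding in the odd lanes, so per element:
+//
+//   sum_r = acc8_r - acc9_i                 // first vaddsubps after the swap
+//   sum_i = acc8_i + acc9_r
+//   c_r  += alpha_r*sum_r - alpha_i*sum_i   // vmulps pair + second vaddsubps,
+//   c_i  += alpha_r*sum_i + alpha_i*sum_r   // then vaddps with C (non-TRMM)
+//
+// acc8/acc9 stand for the xmm8/xmm9 (and xmm12/xmm13) accumulator pairs fed
+// by the KERNEL*x1 macros; the conjugated variants differ only in the signs
+// selected through VFMADD_R / VFMADD_I.
+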
+/************************************************************************** +* Rest of M +***************************************************************************/ + +.L1_20: + testq $3, M + jz .L999 + + testq $2, M + jz .L1_40 + ALIGN_4 + +.L1_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_26 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL2x1_SUB(xxx) + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L1_40: + testq 
$1, M + jz .L999 // to next 2 lines of N + + ALIGN_4 + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + + + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + +#endif + + vmovsd %xmm8 , (CO1) + + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 
64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/cgemm_kernel_8x2_sandy.S b/kernel/x86_64/cgemm_kernel_8x2_sandy.S index c85646d43..988913591 100644 --- a/kernel/x86_64/cgemm_kernel_8x2_sandy.S +++ b/kernel/x86_64/cgemm_kernel_8x2_sandy.S @@ -1,2353 +1,2353 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************************/ - -/********************************************************************* -* 2014/07/29 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* 2013/10/28 Saar -* Parameter: -* CGEMM_DEFAULT_UNROLL_N 2 -* CGEMM_DEFAULT_UNROLL_M 8 -* CGEMM_DEFAULT_P 768 -* CGEMM_DEFAULT_Q 512 -* A_PR1 512 -* B_PR1 512 -* -* 2014/07/29 Saar -* Performance at 6192x6192x6192: -* 1 thread: 49 GFLOPS (MKL: 52) -* 2 threads: 99 GFLOPS (MKL: 102) -* 3 threads: 148 GFLOPS (MKL: 150) -* 4 threads: 195 GFLOPS (MKL: 194) -* 8 threads: 354 GFLOPS (MKL: 317) -* -* -*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 320 - -#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) -#define OLD_A 48 + STACKSIZE(%rsp) -#define OLD_B 56 + STACKSIZE(%rsp) -#define OLD_C 64 + STACKSIZE(%rsp) -#define OLD_LDC 72 + STACKSIZE(%rsp) -#define OLD_OFFSET 80 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA_R 48(%rsp) -#define ALPHA_I 56(%rsp) -#define OFFSET 64(%rsp) -#define KK 72(%rsp) -#define KKK 80(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $ 0, 4096 * 4(%rsp);\ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $ 0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - -#define VFMADDPS_YR( y0,y1,y2 ) \ - vmulps y1,y2,%ymm2;\ - vaddps y0,%ymm2,y0 - -#define VFMADDPS_YI( y0,y1,y2 ) \ - vmulps y1,y2,%ymm3;\ - vaddps y0,%ymm3,y0 - -#define VFMADDPS_R( y0,y1,y2 ) \ - vmulps y1,y2,%xmm2;\ - vaddps y0,%xmm2,y0 - -#define VFMADDPS_I( y0,y1,y2 ) \ - vmulps y1,y2,%xmm3;\ - vaddps y0,%xmm3,y0 - - -#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) - -#define VFMADDPS_YR( y0,y1,y2 ) \ - vmulps y1,y2,%ymm2;\ - vsubps %ymm2,y0,y0 - -#define VFMADDPS_YI( y0,y1,y2 ) \ - vmulps y1,y2,%ymm3;\ - vaddps y0,%ymm3,y0 - -#define VFMADDPS_R( y0,y1,y2 ) \ - vmulps y1,y2,%xmm2;\ - vsubps %xmm2,y0,y0 - -#define VFMADDPS_I( y0,y1,y2 ) \ - vmulps y1,y2,%xmm3;\ - vaddps y0,%xmm3,y0 - - -#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) - -#define VFMADDPS_YR( y0,y1,y2 ) \ - vmulps y1,y2,%ymm2;\ - vaddps y0,%ymm2,y0 - -#define VFMADDPS_YI( y0,y1,y2 ) \ - vmulps y1,y2,%ymm3;\ - vsubps %ymm3,y0,y0 - -#define VFMADDPS_R( y0,y1,y2 ) \ - vmulps y1,y2,%xmm2;\ - vaddps y0,%xmm2,y0 - -#define VFMADDPS_I( y0,y1,y2 ) \ - vmulps y1,y2,%xmm3;\ - vsubps %xmm3,y0,y0 - - -#else - -#define VFMADDPS_YR( y0,y1,y2 ) \ - vmulps y1,y2,%ymm2;\ - vsubps %ymm2,y0,y0 - -#define VFMADDPS_YI( y0,y1,y2 ) \ - vmulps y1,y2,%ymm3;\ - vsubps 
%ymm3,y0,y0 - -#define VFMADDPS_R( y0,y1,y2 ) \ - vmulps y1,y2,%xmm2;\ - vsubps %xmm2,y0,y0 - -#define VFMADDPS_I( y0,y1,y2 ) \ - vmulps y1,y2,%xmm3;\ - vsubps %xmm3,y0,y0 - - -#endif - - -#define A_PR1 512 -#define B_PR1 512 - -/***************************************************************************************************************************/ - -.macro KERNEL8x2_1 - - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 - prefetcht0 A_PR1(AO, %rax, SIZE) - - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) - - - VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 - VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) - vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) - - vmovups 8 * SIZE(AO, %rax, SIZE), %ymm1 - prefetcht0 A_PR1+64(AO, %rax, SIZE) - - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) - - - VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) - vbroadcastss 0 * SIZE(BO, BI, SIZE), %ymm4 - VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) - vbroadcastss 1 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) - vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) - - vmovups 24 * SIZE(AO, %rax, SIZE), %ymm1 - prefetcht0 A_PR1+128(AO, %rax, SIZE) - - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) - vbroadcastss 2 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - vbroadcastss 3 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) - - - VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) - vbroadcastss 4 * SIZE(BO, BI, SIZE), %ymm4 - VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) - vbroadcastss 5 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) - vmovups 32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) - - vmovups 40 * SIZE(AO, %rax, SIZE), %ymm1 - prefetcht0 A_PR1+192(AO, %rax, SIZE) - - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) - vbroadcastss 6 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - vbroadcastss 7 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) - - VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) - VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) - addq $ 16, BI - VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) - VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) - - addq $ 64, %rax -.endm - - -.macro KERNEL8x2_SUB - - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 - vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 - - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) - - - VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) - VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) - VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) - VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) - - addq $ 4 , BI - addq $ 16, %rax -.endm - -.macro SAVE8x2 - - vbroadcastss ALPHA_R, %ymm0 - vbroadcastss ALPHA_I, %ymm1 - - // swap high and low 
64 bytes - vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 - vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 - vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %ymm9, %ymm8 , %ymm8 - vaddsubps %ymm11,%ymm10, %ymm10 - vaddsubps %ymm13,%ymm12, %ymm12 - vaddsubps %ymm15,%ymm14, %ymm14 - - vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 - vshufps $ 0xb1, %ymm10, %ymm10, %ymm11 - vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 - vshufps $ 0xb1, %ymm14, %ymm14, %ymm15 - -#else - vaddsubps %ymm8, %ymm9 ,%ymm9 - vaddsubps %ymm10, %ymm11,%ymm11 - vaddsubps %ymm12, %ymm13,%ymm13 - vaddsubps %ymm14, %ymm15,%ymm15 - - vmovaps %ymm9, %ymm8 - vmovaps %ymm11, %ymm10 - vmovaps %ymm13, %ymm12 - vmovaps %ymm15, %ymm14 - - // swap high and low 64 bytes - vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 - vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 - vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 - -#endif - - // multiply with ALPHA_R - vmulps %ymm8 , %ymm0, %ymm8 - vmulps %ymm10, %ymm0, %ymm10 - vmulps %ymm12, %ymm0, %ymm12 - vmulps %ymm14, %ymm0, %ymm14 - - // multiply with ALPHA_I - vmulps %ymm9 , %ymm1, %ymm9 - vmulps %ymm11, %ymm1, %ymm11 - vmulps %ymm13, %ymm1, %ymm13 - vmulps %ymm15, %ymm1, %ymm15 - - vaddsubps %ymm9, %ymm8 , %ymm8 - vaddsubps %ymm11,%ymm10, %ymm10 - vaddsubps %ymm13,%ymm12, %ymm12 - vaddsubps %ymm15,%ymm14, %ymm14 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %ymm8 , %ymm8 - vaddps 8 * SIZE(CO1), %ymm12, %ymm12 - - vaddps (CO1, LDC), %ymm10, %ymm10 - vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14 - -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 , 8 * SIZE(CO1) - - vmovups %ymm10 , (CO1, LDC) - vmovups %ymm14 , 8 * SIZE(CO1, LDC) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - -.endm - -/***************************************************************************************************************************/ - -.macro KERNEL4x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) - VFMADDPS_R( %xmm14,%xmm6,%xmm1 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) - VFMADDPS_I( %xmm15,%xmm7,%xmm1 ) - addq $ 4, BI - addq $ 8, %rax -.endm - -.macro SAVE4x2 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 - vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 - vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 - vshufps $ 0xb1, %xmm14, %xmm14, %xmm15 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - vaddsubps %xmm12, %xmm13,%xmm13 - vaddsubps %xmm14, %xmm15,%xmm15 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - vmovaps %xmm13, %xmm12 - vmovaps %xmm15, 
%xmm14 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 - vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - vmulps %xmm12, %xmm0, %xmm12 - vmulps %xmm14, %xmm0, %xmm14 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - vmulps %xmm13, %xmm1, %xmm13 - vmulps %xmm15, %xmm1, %xmm15 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - vaddsubps %xmm13,%xmm12, %xmm12 - vaddsubps %xmm15,%xmm14, %xmm14 - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - - vaddps (CO1, LDC), %xmm10, %xmm10 - vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - - vmovups %xmm10 , (CO1, LDC) - vmovups %xmm14 , 4 * SIZE(CO1, LDC) - -.endm - -/************************************************************************************************/ - -.macro KERNEL2x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) - addq $ 4, BI - addq $ 4, %rax -.endm - -.macro SAVE2x2 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 4 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 4 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - - vaddps (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - - vmovups %xmm10 , (CO1, LDC) - -.endm - -/************************************************************************************************/ - -.macro KERNEL1x2_SUB - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 - VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 - VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) - addq $ 4, BI - addq $ 2, %rax -.endm - -.macro SAVE1x2 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps 
%xmm11,%xmm10, %xmm10 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - - vmovsd (CO1, LDC), %xmm15 - vaddps %xmm15, %xmm10, %xmm10 - -#endif - - vmovsd %xmm8 , (CO1) - vmovsd %xmm10 , (CO1, LDC) - -.endm - -/************************************************************************************************/ - -.macro KERNEL8x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 - VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) - VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 - VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) - VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) - addq $ 2 , BI - addq $ 16, %rax -.endm - -.macro SAVE8x1 - - vbroadcastss ALPHA_R, %ymm0 - vbroadcastss ALPHA_I, %ymm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %ymm9, %ymm8 , %ymm8 - vaddsubps %ymm13,%ymm12, %ymm12 - - vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 - vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 - -#else - vaddsubps %ymm8, %ymm9 ,%ymm9 - vaddsubps %ymm12, %ymm13,%ymm13 - - vmovaps %ymm9, %ymm8 - vmovaps %ymm13, %ymm12 - - // swap high and low 64 bytes - vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 - vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 - -#endif - - // multiply with ALPHA_R - vmulps %ymm8 , %ymm0, %ymm8 - vmulps %ymm12, %ymm0, %ymm12 - - // multiply with ALPHA_I - vmulps %ymm9 , %ymm1, %ymm9 - vmulps %ymm13, %ymm1, %ymm13 - - vaddsubps %ymm9, %ymm8 , %ymm8 - vaddsubps %ymm13,%ymm12, %ymm12 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %ymm8 , %ymm8 - vaddps 8 * SIZE(CO1), %ymm12, %ymm12 - -#endif - - vmovups %ymm8 , (CO1) - vmovups %ymm12 , 8 * SIZE(CO1) - -.endm - - -/************************************************************************************************/ - -.macro KERNEL4x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 - VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) - addq $ 2, BI - addq $ 8, %rax -.endm - -.macro SAVE4x1 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 4 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm12, %xmm13,%xmm13 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm13, %xmm12 - - // swap high and 
low 4 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm12, %xmm0, %xmm12 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm13, %xmm1, %xmm13 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm13,%xmm12, %xmm12 - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - vaddps 4 * SIZE(CO1), %xmm12, %xmm12 - -#endif - - vmovups %xmm8 , (CO1) - vmovups %xmm12 , 4 * SIZE(CO1) - -.endm - -/************************************************************************************************/ - -.macro KERNEL2x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - addq $ 2, BI - addq $ 4, %rax -.endm - -.macro SAVE2x1 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - -#endif - - vmovups %xmm8 , (CO1) - -.endm - -/************************************************************************************************/ - -.macro KERNEL1x1_SUB - vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 - VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 - VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) - addq $ 2, BI - addq $ 2, %rax -.endm - -.macro SAVE1x1 - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - - vmovaps %xmm9, %xmm8 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - - vaddsubps %xmm9, %xmm8 , %xmm8 - -#ifndef TRMMKERNEL - - vmovsd (CO1), %xmm14 - vaddps %xmm14, %xmm8 , %xmm8 - -#endif - - vmovsd %xmm8 , (CO1) - -.endm - -/************************************************************************************************/ - - - - - PROLOGUE - PROFCODE - - subq $ STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq 
OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - vmovsd OLD_ALPHA_I, %xmm1 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $ 128 + L_BUFFER_SIZE, %rsp - andq $ -4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA_R - vmovss %xmm1, ALPHA_I - - salq $ ZBASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $ 2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -.L2_0: - - movq Ndiv6, J - cmpq $ 0, J - je .L1_0 - ALIGN_4 - - - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $ 3, I // i = (m >> 3) - je .L2_4_10 - - ALIGN_4 -/**********************************************************************************************************/ - -.L2_8_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 8, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_8_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_8_12: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_1 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_1 - - je .L2_8_16 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_1 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x2_1 - - je .L2_8_16 - - jmp .L2_8_12 - ALIGN_4 - -.L2_8_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_8_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_8_17: - - KERNEL8x2_SUB - - jl .L2_8_17 - ALIGN_4 - - -.L2_8_19: - - SAVE8x2 - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq 
KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 8, KK -#endif - - addq $ 16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_8_11 - ALIGN_4 - - -/**********************************************************************************************************/ - - - - -.L2_4_10: - testq $ 7, M - jz .L2_4_60 // to next 2 lines of N - - testq $ 4, M - jz .L2_4_20 - ALIGN_4 - - -.L2_4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_4_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_4_16 - - jmp .L2_4_12 - ALIGN_4 - -.L2_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_17: - - KERNEL4x2_SUB - - jl .L2_4_17 - ALIGN_4 - - -.L2_4_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/************************************************************************** -* Rest of M -***************************************************************************/ - -.L2_4_20: - - testq $ 2, M - jz .L2_4_40 - ALIGN_4 - -.L2_4_21: - -#if !defined(TRMMKERNEL) || \ - 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_4_26 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_22: - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_4_26 - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_4_26 - - jmp .L2_4_22 - ALIGN_4 - -.L2_4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_27: - - KERNEL2x2_SUB - - jl .L2_4_27 - ALIGN_4 - - -.L2_4_29: - - vbroadcastss ALPHA_R, %xmm0 - vbroadcastss ALPHA_I, %xmm1 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ - defined(NR) || defined(NC) || defined(TR) || defined(TC) - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 - vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 - -#else - vaddsubps %xmm8, %xmm9 ,%xmm9 - vaddsubps %xmm10, %xmm11,%xmm11 - - vmovaps %xmm9, %xmm8 - vmovaps %xmm11, %xmm10 - - // swap high and low 64 bytes - vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 - vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 - -#endif - - // multiply with ALPHA_R - vmulps %xmm8 , %xmm0, %xmm8 - vmulps %xmm10, %xmm0, %xmm10 - - // multiply with ALPHA_I - vmulps %xmm9 , %xmm1, %xmm9 - vmulps %xmm11, %xmm1, %xmm11 - - vaddsubps %xmm9, %xmm8 , %xmm8 - vaddsubps %xmm11,%xmm10, %xmm10 - - - -#ifndef TRMMKERNEL - - vaddps (CO1), %xmm8 , %xmm8 - - vaddps (CO1, LDC), %xmm10, %xmm10 - -#endif - - vmovups %xmm8 , (CO1) - - vmovups %xmm10 , (CO1, LDC) - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - 
addq $ 4 * SIZE, CO1 # coffset += 4 - decq I # i -- - jg .L2_4_21 - ALIGN_4 - - - -/**************************************************************************/ -.L2_4_40: - testq $ 1, M - jz .L2_4_60 // to next 2 lines of N - - ALIGN_4 - -.L2_4_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 8 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L2_4_46 - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_4_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_4_46 - - jmp .L2_4_42 - ALIGN_4 - -.L2_4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L2_4_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_4_47: - - KERNEL1x2_SUB - - jl .L2_4_47 - ALIGN_4 - - -.L2_4_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - decq I # i -- - jg .L2_4_41 - ALIGN_4 - - - - -.L2_4_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $ 2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $ 1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) 
&& defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $ 3, I // i = (m >> 3) - je .L1_4_10 - - ALIGN_4 - -/**************************************************************************************************/ - -.L1_8_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 8, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_8_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_8_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - - je .L1_8_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL8x1_SUB - - je .L1_8_16 - - jmp .L1_8_12 - ALIGN_4 - -.L1_8_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_8_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 4 ; number of values - - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_8_17: - - KERNEL8x1_SUB - - jl .L1_8_17 - ALIGN_4 - - -.L1_8_19: - - SAVE8x1 - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 4, %rax // rax = rax *16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 8, KK -#endif - - addq $ 16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_8_11 - ALIGN_4 - - - -/**************************************************************************************************/ -.L1_4_10: - - testq $ 7, M - jz .L999 - - testq $ 4, M - jz .L1_4_20 - - -.L1_4_11: - -#if !defined(TRMMKERNEL) || \ - 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 4, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_16 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_12: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_4_16 - - jmp .L1_4_12 - ALIGN_4 - -.L1_4_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_19 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 4 ; number of values - - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_17: - - KERNEL4x1_SUB - - jl .L1_4_17 - ALIGN_4 - - -.L1_4_19: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 4, KK -#endif - - addq $ 8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/************************************************************************** -* Rest of M -***************************************************************************/ - -.L1_4_20: - - testq $ 2, M - jz .L1_4_40 - ALIGN_4 - -.L1_4_21: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 2, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_26 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_22: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_4_26 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - prefetcht0 A_PR1(AO,%rax,SIZE) - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_4_26 - - jmp .L1_4_22 - ALIGN_4 - -.L1_4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_29 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2; number of values - - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_27: - - KERNEL2x1_SUB - - jl .L1_4_27 - ALIGN_4 - - -.L1_4_29: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 2, KK -#endif - - addq $ 4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - - -/**************************************************************************/ -.L1_4_40: - testq $ 1, M - jz .L999 // to next 2 lines of N - - ALIGN_4 - -.L1_4_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $ 4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $ 1, %rax // number of values in AO -#else - addq $ 1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $ -8, %rax // K = K - ( K % 8 ) - je .L1_4_46 - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_42: - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_4_46 - - prefetcht0 A_PR1(AO,%rax,SIZE) - prefetcht0 
B_PR1(BO,BI,SIZE) - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_4_46 - - jmp .L1_4_42 - ALIGN_4 - -.L1_4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $ 7, %rax # if (k & 1) - je .L1_4_49 - - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_4_47: - - KERNEL1x1_SUB - - jl .L1_4_47 - ALIGN_4 - - -.L1_4_49: - - SAVE1x1 - - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq ( ,BI,2), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $ 1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $ 1, KK -#endif - - addq $ 2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $ STACKSIZE, %rsp - ret - - EPILOGUE +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/
+
+/*********************************************************************
+* 2014/07/29 Saar
+* BLASTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+* 2013/10/28 Saar
+* Parameter:
+* CGEMM_DEFAULT_UNROLL_N 2
+* CGEMM_DEFAULT_UNROLL_M 8
+* CGEMM_DEFAULT_P 768
+* CGEMM_DEFAULT_Q 512
+* A_PR1 512
+* B_PR1 512
+*
+* 2014/07/29 Saar
+* Performance at 6192x6192x6192:
+* 1 thread: 49 GFLOPS (MKL: 52)
+* 2 threads: 99 GFLOPS (MKL: 102)
+* 3 threads: 148 GFLOPS (MKL: 150)
+* 4 threads: 195 GFLOPS (MKL: 194)
+* 8 threads: 354 GFLOPS (MKL: 317)
+*
+*
+*********************************************************************/
+
+
+#define ASSEMBLER
+#include "common.h"
+
+#define OLD_M %rdi
+#define OLD_N %rsi
+#define M %r13
+#define J %r14
+#define OLD_K %rdx
+
+#define A %rcx
+#define B %r8
+#define C %r9
+#define LDC %r10
+
+#define I %r11
+#define AO %rdi
+#define BO %rsi
+#define CO1 %r15
+#define K %r12
+#define BI %rbp
+#define SP %rbx
+
+#define BO1 %rdi
+#define BO2 %r15
+
+#ifndef WINDOWS_ABI
+
+#define STACKSIZE 96
+
+#else
+
+#define STACKSIZE 320
+
+#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
+#define OLD_A 48 + STACKSIZE(%rsp)
+#define OLD_B 56 + STACKSIZE(%rsp)
+#define OLD_C 64 + STACKSIZE(%rsp)
+#define OLD_LDC 72 + STACKSIZE(%rsp)
+#define OLD_OFFSET 80 + STACKSIZE(%rsp)
+
+#endif
+
+#define L_BUFFER_SIZE 8192
+
+#define Ndiv6 24(%rsp)
+#define Nmod6 32(%rsp)
+#define N 40(%rsp)
+#define ALPHA_R 48(%rsp)
+#define ALPHA_I 56(%rsp)
+#define OFFSET 64(%rsp)
+#define KK 72(%rsp)
+#define KKK 80(%rsp)
+#define BUFFER1 128(%rsp)
+
+#if defined(OS_WINDOWS)
+#if L_BUFFER_SIZE > 16384
+#define STACK_TOUCH \
+        movl $ 0, 4096 * 4(%rsp);\
+        movl $ 0, 4096 * 3(%rsp);\
+        movl $ 0, 4096 * 2(%rsp);\
+        movl $ 0, 4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 12288
+#define STACK_TOUCH \
+        movl $ 0, 4096 * 3(%rsp);\
+        movl $ 0, 4096 * 2(%rsp);\
+        movl $ 0, 4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 8192
+#define STACK_TOUCH \
+        movl $ 0, 4096 * 2(%rsp);\
+        movl $ 0, 4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 4096
+#define STACK_TOUCH \
+        movl $ 0, 4096 * 1(%rsp);
+#else
+#define STACK_TOUCH
+#endif
+#else
+#define STACK_TOUCH
+#endif
+
+
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+
+#define VFMADDPS_YR( y0,y1,y2 ) \
+        vmulps y1,y2,%ymm2;\
+        vaddps y0,%ymm2,y0
+
+#define VFMADDPS_YI( y0,y1,y2 ) \
+        vmulps y1,y2,%ymm3;\
+        vaddps y0,%ymm3,y0
+
+#define VFMADDPS_R( y0,y1,y2 ) \
+        vmulps y1,y2,%xmm2;\
+        vaddps y0,%xmm2,y0
+
+#define VFMADDPS_I( y0,y1,y2 ) \
+        vmulps y1,y2,%xmm3;\
+        vaddps y0,%xmm3,y0
+
+
+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
+
+#define VFMADDPS_YR( y0,y1,y2 ) \
+        vmulps y1,y2,%ymm2;\
+        vsubps %ymm2,y0,y0
+
+#define VFMADDPS_YI( y0,y1,y2 ) \
+        vmulps y1,y2,%ymm3;\
+        vaddps y0,%ymm3,y0
+
+#define VFMADDPS_R( y0,y1,y2 ) \
+        vmulps y1,y2,%xmm2;\
+        vsubps %xmm2,y0,y0
+
+#define VFMADDPS_I( y0,y1,y2 ) \
+        vmulps y1,y2,%xmm3;\
+        vaddps y0,%xmm3,y0
+
+
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+
+#define VFMADDPS_YR( y0,y1,y2 ) \
+        vmulps y1,y2,%ymm2;\
+        vaddps y0,%ymm2,y0
+
+#define VFMADDPS_YI( y0,y1,y2 ) \
+        vmulps y1,y2,%ymm3;\
+        vsubps %ymm3,y0,y0
+
+#define VFMADDPS_R( y0,y1,y2 ) \
+        vmulps y1,y2,%xmm2;\
+        vaddps y0,%xmm2,y0
+
+#define VFMADDPS_I( y0,y1,y2 ) \
+        vmulps y1,y2,%xmm3;\
+        vsubps %xmm3,y0,y0
+
+
+#else
+
+#define VFMADDPS_YR( y0,y1,y2 ) \
+        vmulps y1,y2,%ymm2;\
+        vsubps %ymm2,y0,y0
+
+#define VFMADDPS_YI( y0,y1,y2 ) \
+        vmulps y1,y2,%ymm3;\
+        vsubps
%ymm3,y0,y0 + +#define VFMADDPS_R( y0,y1,y2 ) \ + vmulps y1,y2,%xmm2;\ + vsubps %xmm2,y0,y0 + +#define VFMADDPS_I( y0,y1,y2 ) \ + vmulps y1,y2,%xmm3;\ + vsubps %xmm3,y0,y0 + + +#endif + + +#define A_PR1 512 +#define B_PR1 512 + +/***************************************************************************************************************************/ + +.macro KERNEL8x2_1 + + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 + prefetcht0 A_PR1(AO, %rax, SIZE) + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm1 + prefetcht0 A_PR1+64(AO, %rax, SIZE) + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + vbroadcastss 0 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + vbroadcastss 1 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm1 + prefetcht0 A_PR1+128(AO, %rax, SIZE) + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + vbroadcastss 2 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + vbroadcastss 3 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + vbroadcastss 4 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + vbroadcastss 5 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + vmovups 32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + + vmovups 40 * SIZE(AO, %rax, SIZE), %ymm1 + prefetcht0 A_PR1+192(AO, %rax, SIZE) + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + vbroadcastss 6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + vbroadcastss 7 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + addq $ 16, BI + VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + + addq $ 64, %rax +.endm + + +.macro KERNEL8x2_SUB + + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 + vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 + + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + + + VFMADDPS_YR( %ymm10,%ymm6,%ymm0 ) + VFMADDPS_YI( %ymm11,%ymm7,%ymm0 ) + VFMADDPS_YR( %ymm14,%ymm6,%ymm1 ) + VFMADDPS_YI( %ymm15,%ymm7,%ymm1 ) + + addq $ 4 , BI + addq $ 16, %rax +.endm + +.macro SAVE8x2 + + vbroadcastss ALPHA_R, %ymm0 + vbroadcastss ALPHA_I, %ymm1 + + // swap high and low 
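The KERNEL8x2_1 macro above interleaves prefetcht0 requests A_PR1 (512 bytes) ahead of the A loads it is currently consuming, so the next panel lines are already in cache when the multiplies reach them. A minimal C sketch of the same streaming-prefetch idea, assuming GCC/Clang's __builtin_prefetch and an invented look-ahead of 128 floats (512 bytes); it illustrates the idiom only, not the kernel itself:

    #include <stdio.h>

    /* Stream through a buffer while prefetching ~512 bytes ahead, the way
     * KERNEL8x2_1 issues prefetcht0 A_PR1(AO,%rax,SIZE) alongside its loads. */
    static float sum_with_prefetch(const float *a, long n)
    {
        float s = 0.0f;
        for (long i = 0; i < n; i++) {
            if (i + 128 < n)
                __builtin_prefetch(a + i + 128, 0, 3);  /* read access, keep in cache */
            s += a[i];
        }
        return s;
    }

    int main(void)
    {
        float v[1024];
        for (int i = 0; i < 1024; i++) v[i] = 1.0f;
        printf("%g\n", sum_with_prefetch(v, 1024));     /* prints 1024 */
        return 0;
    }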
64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm11,%ymm10, %ymm10 + vaddsubps %ymm13,%ymm12, %ymm12 + vaddsubps %ymm15,%ymm14, %ymm14 + + vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 + vshufps $ 0xb1, %ymm10, %ymm10, %ymm11 + vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 + vshufps $ 0xb1, %ymm14, %ymm14, %ymm15 + +#else + vaddsubps %ymm8, %ymm9 ,%ymm9 + vaddsubps %ymm10, %ymm11,%ymm11 + vaddsubps %ymm12, %ymm13,%ymm13 + vaddsubps %ymm14, %ymm15,%ymm15 + + vmovaps %ymm9, %ymm8 + vmovaps %ymm11, %ymm10 + vmovaps %ymm13, %ymm12 + vmovaps %ymm15, %ymm14 + + // swap high and low 64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 + +#endif + + // multiply with ALPHA_R + vmulps %ymm8 , %ymm0, %ymm8 + vmulps %ymm10, %ymm0, %ymm10 + vmulps %ymm12, %ymm0, %ymm12 + vmulps %ymm14, %ymm0, %ymm14 + + // multiply with ALPHA_I + vmulps %ymm9 , %ymm1, %ymm9 + vmulps %ymm11, %ymm1, %ymm11 + vmulps %ymm13, %ymm1, %ymm13 + vmulps %ymm15, %ymm1, %ymm15 + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm11,%ymm10, %ymm10 + vaddsubps %ymm13,%ymm12, %ymm12 + vaddsubps %ymm15,%ymm14, %ymm14 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %ymm8 , %ymm8 + vaddps 8 * SIZE(CO1), %ymm12, %ymm12 + + vaddps (CO1, LDC), %ymm10, %ymm10 + vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 8 * SIZE(CO1) + + vmovups %ymm10 , (CO1, LDC) + vmovups %ymm14 , 8 * SIZE(CO1, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + +.endm + +/***************************************************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) + VFMADDPS_R( %xmm14,%xmm6,%xmm1 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) + VFMADDPS_I( %xmm15,%xmm7,%xmm1 ) + addq $ 4, BI + addq $ 8, %rax +.endm + +.macro SAVE4x2 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 + vshufps $ 0xb1, %xmm14, %xmm14, %xmm15 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + vaddsubps %xmm12, %xmm13,%xmm13 + vaddsubps %xmm14, %xmm15,%xmm15 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + vmovaps %xmm13, %xmm12 + vmovaps %xmm15, 
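The VFMADDPS_* variants defined earlier differ only in whether the real-part and imaginary-part partial products are added or subtracted, which is how the four conjugation families (neither, A, B, or both operands conjugated) are selected, and the SAVE* macros then recombine the two accumulator registers and apply alpha. Despite the recurring "swap high and low 64 bytes" comments, vshufps with immediate 0xb1 swaps the 4-byte real and imaginary halves of each complex element. A scalar C model of the non-conjugated path for one output element, with invented names, not code from the kernel:

    #include <stdio.h>

    /* acc_r holds sum(a * Re(b)) as an interleaved (re, im) pair like %ymm8,
     * acc_i holds sum(a * Im(b)) like %ymm9.  The SAVE step's vshufps $0xb1 /
     * vaddsubps sequence is the recombination written out below, followed by
     * the multiply with ALPHA_R/ALPHA_I done the same way. */
    static void cgemm_lane_model(const float *a, const float *b, int k,
                                 float alpha_r, float alpha_i, float *c)
    {
        float accr_re = 0.0f, accr_im = 0.0f;   /* like %ymm8 */
        float acci_re = 0.0f, acci_im = 0.0f;   /* like %ymm9 */
        for (int p = 0; p < k; p++) {
            float ar = a[2 * p], ai = a[2 * p + 1];
            float br = b[2 * p], bi = b[2 * p + 1];
            accr_re += ar * br;  accr_im += ai * br;   /* VFMADDPS_YR */
            acci_re += ar * bi;  acci_im += ai * bi;   /* VFMADDPS_YI */
        }
        /* shuffle swaps (re, im) of acc_i; vaddsubps subtracts in the even
         * (real) lane and adds in the odd (imaginary) lane: */
        float t_re = accr_re - acci_im;
        float t_im = accr_im + acci_re;
        /* second shuffle + vmulps by ALPHA_R/ALPHA_I + vaddsubps = t * alpha */
        c[0] += t_re * alpha_r - t_im * alpha_i;
        c[1] += t_im * alpha_r + t_re * alpha_i;
    }

    int main(void)
    {
        float a[4] = {1, 2, 3, 4};          /* (1+2i), (3+4i) */
        float b[4] = {5, 6, 7, 8};          /* (5+6i), (7+8i) */
        float c[2] = {0, 0};
        cgemm_lane_model(a, b, 2, 1.0f, 0.0f, c);
        printf("%g %g\n", c[0], c[1]);      /* -18 68 */
        return 0;
    }

The "c[...] +=" at the end corresponds to the #ifndef TRMMKERNEL branch of each SAVE macro; the TRMM build stores the scaled product without reading C first.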
%xmm14 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + vmulps %xmm12, %xmm0, %xmm12 + vmulps %xmm14, %xmm0, %xmm14 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + vmulps %xmm13, %xmm1, %xmm13 + vmulps %xmm15, %xmm1, %xmm15 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + vaddsubps %xmm13,%xmm12, %xmm12 + vaddsubps %xmm15,%xmm14, %xmm14 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + + vaddps (CO1, LDC), %xmm10, %xmm10 + vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + + vmovups %xmm10 , (CO1, LDC) + vmovups %xmm14 , 4 * SIZE(CO1, LDC) + +.endm + +/************************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) + addq $ 4, BI + addq $ 4, %rax +.endm + +.macro SAVE2x2 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 4 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 4 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + +.endm + +/************************************************************************************************/ + +.macro KERNEL1x2_SUB + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 + VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 + VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) + addq $ 4, BI + addq $ 2, %rax +.endm + +.macro SAVE1x2 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps 
%xmm11,%xmm10, %xmm10 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + + vmovsd (CO1, LDC), %xmm15 + vaddps %xmm15, %xmm10, %xmm10 + +#endif + + vmovsd %xmm8 , (CO1) + vmovsd %xmm10 , (CO1, LDC) + +.endm + +/************************************************************************************************/ + +.macro KERNEL8x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 + VFMADDPS_YR( %ymm8,%ymm4,%ymm0 ) + VFMADDPS_YR( %ymm12,%ymm4,%ymm1 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 + VFMADDPS_YI( %ymm9,%ymm5,%ymm0 ) + VFMADDPS_YI( %ymm13,%ymm5,%ymm1 ) + addq $ 2 , BI + addq $ 16, %rax +.endm + +.macro SAVE8x1 + + vbroadcastss ALPHA_R, %ymm0 + vbroadcastss ALPHA_I, %ymm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm13,%ymm12, %ymm12 + + vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 + vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 + +#else + vaddsubps %ymm8, %ymm9 ,%ymm9 + vaddsubps %ymm12, %ymm13,%ymm13 + + vmovaps %ymm9, %ymm8 + vmovaps %ymm13, %ymm12 + + // swap high and low 64 bytes + vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 + vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 + +#endif + + // multiply with ALPHA_R + vmulps %ymm8 , %ymm0, %ymm8 + vmulps %ymm12, %ymm0, %ymm12 + + // multiply with ALPHA_I + vmulps %ymm9 , %ymm1, %ymm9 + vmulps %ymm13, %ymm1, %ymm13 + + vaddsubps %ymm9, %ymm8 , %ymm8 + vaddsubps %ymm13,%ymm12, %ymm12 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %ymm8 , %ymm8 + vaddps 8 * SIZE(CO1), %ymm12, %ymm12 + +#endif + + vmovups %ymm8 , (CO1) + vmovups %ymm12 , 8 * SIZE(CO1) + +.endm + + +/************************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 + VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) + addq $ 2, BI + addq $ 8, %rax +.endm + +.macro SAVE4x1 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 4 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm12, %xmm13,%xmm13 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm13, %xmm12 + + // swap high and 
low 4 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm12, %xmm0, %xmm12 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm13, %xmm1, %xmm13 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm13,%xmm12, %xmm12 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + vaddps 4 * SIZE(CO1), %xmm12, %xmm12 + +#endif + + vmovups %xmm8 , (CO1) + vmovups %xmm12 , 4 * SIZE(CO1) + +.endm + +/************************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + addq $ 2, BI + addq $ 4, %rax +.endm + +.macro SAVE2x1 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + +#endif + + vmovups %xmm8 , (CO1) + +.endm + +/************************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 + VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 + VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) + addq $ 2, BI + addq $ 2, %rax +.endm + +.macro SAVE1x1 + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + + vmovaps %xmm9, %xmm8 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + + vaddsubps %xmm9, %xmm8 , %xmm8 + +#ifndef TRMMKERNEL + + vmovsd (CO1), %xmm14 + vaddps %xmm14, %xmm8 , %xmm8 + +#endif + + vmovsd %xmm8 , (CO1) + +.endm + +/************************************************************************************************/ + + + + + PROLOGUE + PROFCODE + + subq $ STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq 
OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + vmovsd OLD_ALPHA_I, %xmm1 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $ 128 + L_BUFFER_SIZE, %rsp + andq $ -4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA_R + vmovss %xmm1, ALPHA_I + + salq $ ZBASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $ 2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +.L2_0: + + movq Ndiv6, J + cmpq $ 0, J + je .L1_0 + ALIGN_4 + + + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $ 3, I // i = (m >> 3) + je .L2_4_10 + + ALIGN_4 +/**********************************************************************************************************/ + +.L2_8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 8, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_8_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_8_12: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_1 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_1 + + je .L2_8_16 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_1 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x2_1 + + je .L2_8_16 + + jmp .L2_8_12 + ALIGN_4 + +.L2_8_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_8_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_8_17: + + KERNEL8x2_SUB + + jl .L2_8_17 + ALIGN_4 + + +.L2_8_19: + + SAVE8x2 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq 
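One idiom worth calling out in the K loops above and below: AO and BO are first advanced past the packed blocks, then %rax and BI are negated so the KERNEL macros can count upward toward zero with plain addq and let the sign/zero flags drive the je/jl loop branches. A minimal C analogue of that negative-index counting, with invented names:

    #include <stdio.h>

    /* Sum n floats by moving the base pointer to the end of the block (like
     * AO/BO) and letting the index climb from -n toward zero. */
    static float sum_negative_index(const float *data, long n)
    {
        const float *end = data + n;
        float s = 0.0f;
        for (long i = -n; i != 0; i++)
            s += end[i];
        return s;
    }

    int main(void)
    {
        float v[5] = {1, 2, 3, 4, 5};
        printf("%g\n", sum_negative_index(v, 5));   /* prints 15 */
        return 0;
    }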
KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 8, KK +#endif + + addq $ 16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_8_11 + ALIGN_4 + + +/**********************************************************************************************************/ + + + + +.L2_4_10: + testq $ 7, M + jz .L2_4_60 // to next 2 lines of N + + testq $ 4, M + jz .L2_4_20 + ALIGN_4 + + +.L2_4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_4_16 + + jmp .L2_4_12 + ALIGN_4 + +.L2_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_17: + + KERNEL4x2_SUB + + jl .L2_4_17 + ALIGN_4 + + +.L2_4_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L2_4_20: + + testq $ 2, M + jz .L2_4_40 + ALIGN_4 + +.L2_4_21: + +#if !defined(TRMMKERNEL) || \ + 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_26 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_22: + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_4_26 + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_4_26 + + jmp .L2_4_22 + ALIGN_4 + +.L2_4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_27: + + KERNEL2x2_SUB + + jl .L2_4_27 + ALIGN_4 + + +.L2_4_29: + + vbroadcastss ALPHA_R, %xmm0 + vbroadcastss ALPHA_I, %xmm1 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 + vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 + +#else + vaddsubps %xmm8, %xmm9 ,%xmm9 + vaddsubps %xmm10, %xmm11,%xmm11 + + vmovaps %xmm9, %xmm8 + vmovaps %xmm11, %xmm10 + + // swap high and low 64 bytes + vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 + vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 + +#endif + + // multiply with ALPHA_R + vmulps %xmm8 , %xmm0, %xmm8 + vmulps %xmm10, %xmm0, %xmm10 + + // multiply with ALPHA_I + vmulps %xmm9 , %xmm1, %xmm9 + vmulps %xmm11, %xmm1, %xmm11 + + vaddsubps %xmm9, %xmm8 , %xmm8 + vaddsubps %xmm11,%xmm10, %xmm10 + + + +#ifndef TRMMKERNEL + + vaddps (CO1), %xmm8 , %xmm8 + + vaddps (CO1, LDC), %xmm10, %xmm10 + +#endif + + vmovups %xmm8 , (CO1) + + vmovups %xmm10 , (CO1, LDC) + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + 
addq $ 4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L2_4_21 + ALIGN_4 + + + +/**************************************************************************/ +.L2_4_40: + testq $ 1, M + jz .L2_4_60 // to next 2 lines of N + + ALIGN_4 + +.L2_4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 8 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L2_4_46 + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_4_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_4_46 + + jmp .L2_4_42 + ALIGN_4 + +.L2_4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L2_4_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_4_47: + + KERNEL1x2_SUB + + jl .L2_4_47 + ALIGN_4 + + +.L2_4_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + decq I # i -- + jg .L2_4_41 + ALIGN_4 + + + + +.L2_4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $ 2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $ 1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) 
&& defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $ 3, I // i = (m >> 3) + je .L1_4_10 + + ALIGN_4 + +/**************************************************************************************************/ + +.L1_8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 8, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_8_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_8_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + je .L1_8_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL8x1_SUB + + je .L1_8_16 + + jmp .L1_8_12 + ALIGN_4 + +.L1_8_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_8_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_8_17: + + KERNEL8x1_SUB + + jl .L1_8_17 + ALIGN_4 + + +.L1_8_19: + + SAVE8x1 + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 4, %rax // rax = rax *16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 8, KK +#endif + + addq $ 16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_8_11 + ALIGN_4 + + + +/**************************************************************************************************/ +.L1_4_10: + + testq $ 7, M + jz .L999 + + testq $ 4, M + jz .L1_4_20 + + +.L1_4_11: + +#if !defined(TRMMKERNEL) || \ + 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 4, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_16 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_12: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_4_16 + + jmp .L1_4_12 + ALIGN_4 + +.L1_4_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_19 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 4 ; number of values + + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_17: + + KERNEL4x1_SUB + + jl .L1_4_17 + ALIGN_4 + + +.L1_4_19: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 4, KK +#endif + + addq $ 8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/************************************************************************** +* Rest of M +***************************************************************************/ + +.L1_4_20: + + testq $ 2, M + jz .L1_4_40 + ALIGN_4 + +.L1_4_21: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 2, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_26 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_22: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_4_26 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + prefetcht0 A_PR1(AO,%rax,SIZE) + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_4_26 + + jmp .L1_4_22 + ALIGN_4 + +.L1_4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_29 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2; number of values + + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_27: + + KERNEL2x1_SUB + + jl .L1_4_27 + ALIGN_4 + + +.L1_4_29: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 2, KK +#endif + + addq $ 4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + + +/**************************************************************************/ +.L1_4_40: + testq $ 1, M + jz .L999 // to next 2 lines of N + + ALIGN_4 + +.L1_4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $ 4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $ 1, %rax // number of values in AO +#else + addq $ 1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $ -8, %rax // K = K - ( K % 8 ) + je .L1_4_46 + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_42: + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_4_46 + + prefetcht0 A_PR1(AO,%rax,SIZE) + prefetcht0 
B_PR1(BO,BI,SIZE) + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_4_46 + + jmp .L1_4_42 + ALIGN_4 + +.L1_4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $ 7, %rax # if (k & 1) + je .L1_4_49 + + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_4_47: + + KERNEL1x1_SUB + + jl .L1_4_47 + ALIGN_4 + + +.L1_4_49: + + SAVE1x1 + + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq ( ,BI,2), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $ 1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $ 1, KK +#endif + + addq $ 2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $ STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/dgemm_kernel_16x2_haswell.S b/kernel/x86_64/dgemm_kernel_16x2_haswell.S index 98b582c0d..899c5f241 100644 --- a/kernel/x86_64/dgemm_kernel_16x2_haswell.S +++ b/kernel/x86_64/dgemm_kernel_16x2_haswell.S @@ -1,5215 +1,5215 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. 
*/ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -/********************************************************************* -* 2013/10/20 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK - -* -* -* 2013/10/20 Saar -* Parameter: -* DGEMM_DEFAULT_UNROLL_N 2 -* DGEMM_DEFAULT_UNROLL_M 16 -* DGEMM_DEFAULT_P 192 -* DGEMM_DEFAULT_Q 128 -* A_PR1 512 -* -* -* Performance without prefetch of B: -* 1 thread: 45.8 GFLOPS (MKL: 45) -* 2 threads: 80.0 GFLOPS (MKL: 91) -* 4 threads: 135.0 GFLOPS (MKL: 135) -*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 512*8*4 -#define LB2_OFFSET 512*8*2 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#if defined(BULLDOZER) - -.macro VFMADD231PD_ y0,y1,y2 - vfmaddpd \y0,\y1,\y2,\y0 -.endm - -.macro VFMADD231SD_ x0,x1,x2 - vfmaddsd \x0,\x1,\x2,\x0 -.endm - -#else - -.macro VFMADD231PD_ y0,y1,y2 - vfmadd231pd \y2,\y1,\y0 -.endm - -.macro VFMADD231SD_ x0,x1,x2 - vfmadd231sd \x2,\x1,\x0 -.endm - -#endif - - -#define A_PR1 512 -#define B_PR1 256 - -/******************************************************************************************* -* 3 lines of N -*******************************************************************************************/ - -.macro KERNEL16x3_SUBN - prefetcht0 A_PR1(AO) - vbroadcastsd -12 * SIZE(BO), %ymm1 - vmovaps -16 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -11 * SIZE(BO), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -10 * SIZE(BO), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovaps -12 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 A_PR1+64(AO) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovaps -8 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovaps -4 * SIZE(AO), %ymm0 - VFMADD231PD_ 
%ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - VFMADD231PD_ %ymm15,%ymm3,%ymm0 - addq $ 3*SIZE , BO - addq $ 16*SIZE, AO -.endm - - -.macro KERNEL8x3_SUBN - //prefetcht0 A_PR1(AO) - vbroadcastsd -12 * SIZE(BO), %ymm1 - vmovaps -16 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -11 * SIZE(BO), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -10 * SIZE(BO), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovaps -12 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - //prefetcht0 A_PR1+64(AO) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - prefetcht0 B_PR1(BO) - addq $ 3*SIZE , BO - addq $ 8*SIZE, AO -.endm - -.macro KERNEL4x3_SUBN - vbroadcastsd -12 * SIZE(BO), %ymm1 - vmovaps -16 * SIZE(AO), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -11 * SIZE(BO), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -10 * SIZE(BO), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - addq $ 3*SIZE , BO - addq $ 4*SIZE, AO -.endm - -.macro KERNEL2x3_SUBN - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -10 * SIZE(BO), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -15 * SIZE(AO), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 - addq $ 3*SIZE , BO - addq $ 2*SIZE, AO -.endm - -.macro KERNEL1x3_SUBN - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -10 * SIZE(BO), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - addq $ 3*SIZE , BO - addq $ 1*SIZE, AO -.endm - - - - - - -/******************************************************************************************/ - -.macro KERNEL16x3_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm15,%ymm3,%ymm0 -.endm - - - - -.macro KERNEL16x3_2 - prefetcht0 128+A_PR1(AO, %rax, SIZE) - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - prefetcht0 A_PR1+64(AO,%rax,SIZE) - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - prefetcht0 192+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ 
%ymm15,%ymm3,%ymm0 -.endm - -.macro KERNEL16x3_3 - prefetcht0 256+A_PR1(AO, %rax, SIZE) - vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 320+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 - vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm15,%ymm3,%ymm0 -.endm - -.macro KERNEL16x3_4 - prefetcht0 384+A_PR1(AO, %rax, SIZE) - vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 448+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - addq $12, BI - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - addq $64, %rax - VFMADD231PD_ %ymm15,%ymm3,%ymm0 -.endm - -.macro KERNEL16x3_SUB - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - VFMADD231PD_ %ymm12,%ymm3,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - VFMADD231PD_ %ymm15,%ymm3,%ymm0 - addq $3 , BI - addq $16, %rax -.endm - -.macro SAVE16x3 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm13, %ymm13 - - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm8 , %ymm8 - vmulpd %ymm0 , %ymm11, %ymm11 - vmulpd %ymm0 , %ymm14, %ymm14 - - vmulpd %ymm0 , %ymm6 , %ymm6 - vmulpd %ymm0 , %ymm9 , %ymm9 - vmulpd %ymm0 , %ymm12, %ymm12 - vmulpd %ymm0 , %ymm15, %ymm15 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 - vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 - - vaddpd (CO1, LDC), %ymm5,%ymm5 - vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 - vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 - vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 - - vaddpd (CO1, LDC, 2), %ymm6,%ymm6 - vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 - vaddpd 8 * SIZE(CO1, LDC, 2), %ymm12,%ymm12 - vaddpd 12 * SIZE(CO1, LDC, 2), %ymm15,%ymm15 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - vmovups %ymm10, 8 * SIZE(CO1) - vmovups %ymm13,12 * SIZE(CO1) - - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm8 , 4 * SIZE(CO1, LDC) - vmovups %ymm11, 8 * SIZE(CO1, 
LDC) - vmovups %ymm14,12 * SIZE(CO1, LDC) - - vmovups %ymm6 , (CO1, LDC, 2) - vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) - vmovups %ymm12, 8 * SIZE(CO1, LDC, 2) - vmovups %ymm15,12 * SIZE(CO1, LDC, 2) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x3_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 -.endm - -.macro KERNEL8x3_2 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 -.endm - -.macro KERNEL8x3_3 - prefetcht0 128+A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 -.endm - -.macro KERNEL8x3_4 - prefetcht0 192+A_PR1(AO, %rax, SIZE) - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - addq $12, BI - addq $32, %rax -.endm - -.macro KERNEL8x3_SUB - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - VFMADD231PD_ %ymm9,%ymm3,%ymm0 - addq $3 , BI - addq $8 , %rax -.endm - -.macro SAVE8x3 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm8 , %ymm8 - - vmulpd %ymm0 , %ymm6 , %ymm6 - vmulpd %ymm0 , %ymm9 , %ymm9 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - - vaddpd (CO1, LDC), %ymm5,%ymm5 - vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 - - vaddpd (CO1, LDC, 2), %ymm6,%ymm6 - vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm8 , 4 * SIZE(CO1, LDC) - - vmovups %ymm6 , (CO1, LDC, 2) - vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) - -.endm - - - 
-/*******************************************************************************************/ - -.macro KERNEL4x3_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 -.endm - -.macro KERNEL4x3_2 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 -.endm - -.macro KERNEL4x3_3 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 -.endm - -.macro KERNEL4x3_4 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - addq $12, BI - addq $16, %rax -.endm - -.macro KERNEL4x3_SUB - vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 - VFMADD231PD_ %ymm6,%ymm3,%ymm0 - addq $3 , BI - addq $4 , %rax -.endm - -.macro SAVE4x3 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd (CO1, LDC), %ymm5,%ymm5 - vaddpd (CO1, LDC, 2), %ymm6,%ymm6 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (CO1, LDC, 2) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x3_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 -.endm - -.macro KERNEL2x3_2 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 -.endm - -.macro KERNEL2x3_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - 
VFMADD231SD_ %xmm12,%xmm3,%xmm0 -.endm - -.macro KERNEL2x3_4 - vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 - addq $12, BI - addq $8, %rax -.endm - -.macro KERNEL2x3_SUB - vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - VFMADD231SD_ %xmm12,%xmm3,%xmm0 - addq $3 , BI - addq $2 , %rax -.endm - -.macro SAVE2x3 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm8 , %xmm8 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm10, %xmm10 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm12, %xmm12 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 - vaddsd (CO1, LDC), %xmm5,%xmm5 - vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 - vaddsd (CO1, LDC, 2), %xmm6,%xmm6 - vaddsd 1 * SIZE(CO1, LDC, 2), %xmm12,%xmm12 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm8 , 1 * SIZE(CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm10, 1 * SIZE(CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - vmovsd %xmm12, 1 * SIZE(CO1, LDC, 2) - -.endm - -/*******************************************************************************************/ - -.macro KERNEL1x3_1 - vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 -.endm - -.macro KERNEL1x3_2 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 -.endm - -.macro KERNEL1x3_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 -.endm - -.macro KERNEL1x3_4 - vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - addq $12, BI - addq $4, %rax -.endm - -.macro KERNEL1x3_SUB - vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 - VFMADD231SD_ %xmm6,%xmm3,%xmm0 - addq $3 , BI - addq $1 , %rax -.endm - -.macro SAVE1x3 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd (CO1, LDC), %xmm5,%xmm5 - vaddsd (CO1, LDC, 2), %xmm6,%xmm6 - -#endif - - 
vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -.macro KERNEL16x2_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 -.endm - -.macro KERNEL16x2_2 - prefetcht0 128+A_PR1(AO, %rax, SIZE) - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 192+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 -.endm - -.macro KERNEL16x2_3 - prefetcht0 256+A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 320+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 -.endm - -.macro KERNEL16x2_4 - prefetcht0 384+A_PR1(AO, %rax, SIZE) - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - prefetcht0 448+A_PR1(AO, %rax, SIZE) - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - addq $8, BI - addq $64, %rax -.endm - -.macro KERNEL16x2_SUB - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - VFMADD231PD_ %ymm11,%ymm2,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - VFMADD231PD_ %ymm14,%ymm2,%ymm0 - addq $2, BI 
- addq $16, %rax -.endm - -.macro SAVE16x2 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm13, %ymm13 - - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm8 , %ymm8 - vmulpd %ymm0 , %ymm11, %ymm11 - vmulpd %ymm0 , %ymm14, %ymm14 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 - vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 - - vaddpd (CO1, LDC), %ymm5,%ymm5 - vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 - vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 - vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - vmovups %ymm10, 8 * SIZE(CO1) - vmovups %ymm13,12 * SIZE(CO1) - - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm8 , 4 * SIZE(CO1, LDC) - vmovups %ymm11, 8 * SIZE(CO1, LDC) - vmovups %ymm14,12 * SIZE(CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x2_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 -.endm - -.macro KERNEL8x2_2 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 -.endm - -.macro KERNEL8x2_3 - prefetcht0 128+A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 -.endm - -.macro KERNEL8x2_4 - prefetcht0 192+A_PR1(AO, %rax, SIZE) - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - addq $8, BI - addq $32, %rax -.endm - -.macro KERNEL8x2_SUB - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - VFMADD231PD_ %ymm8,%ymm2,%ymm0 - addq $2, BI - addq $8 , %rax -.endm - -.macro SAVE8x2 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm8 , %ymm8 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - - vaddpd (CO1, LDC), %ymm5,%ymm5 - vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm8 , 4 * SIZE(CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - 
-.macro KERNEL4x2_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 -.endm - -.macro KERNEL4x2_2 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 -.endm - -.macro KERNEL4x2_3 - prefetcht0 64+A_PR1(AO, %rax, SIZE) - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 -.endm - -.macro KERNEL4x2_4 - vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - addq $8, BI - addq $16, %rax -.endm - -.macro KERNEL4x2_SUB - vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 - VFMADD231PD_ %ymm5,%ymm2,%ymm0 - addq $2, BI - addq $4 , %rax -.endm - -.macro SAVE4x2 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm5 , %ymm5 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd (CO1, LDC), %ymm5,%ymm5 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x2_1 - prefetcht0 A_PR1(AO, %rax, SIZE) - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 -.endm - -.macro KERNEL2x2_2 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 -.endm - -.macro KERNEL2x2_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 -.endm - -.macro KERNEL2x2_4 - vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - addq $8, BI - addq $8, %rax -.endm - -.macro KERNEL2x2_SUB - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - VFMADD231SD_ %xmm10,%xmm2,%xmm0 - addq $2, BI - addq $2, %rax -.endm - -.macro SAVE2x2 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm8 , %xmm8 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , 
%xmm10, %xmm10 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 - vaddsd (CO1, LDC), %xmm5,%xmm5 - vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm8 , 1 * SIZE(CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm10, 1 * SIZE(CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x2_1 - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 -.endm - -.macro KERNEL1x2_2 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 -.endm - -.macro KERNEL1x2_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 -.endm - -.macro KERNEL1x2_4 - vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - addq $8, BI - addq $4, %rax -.endm - -.macro KERNEL1x2_SUB - vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 - VFMADD231SD_ %xmm5,%xmm2,%xmm0 - addq $2, BI - addq $1, %rax -.endm - -.macro SAVE1x2 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd (CO1, LDC), %xmm5,%xmm5 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -.macro KERNEL16x1_1 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 -.endm - -.macro KERNEL16x1_2 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 -.endm - -.macro KERNEL16x1_3 - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 -.endm - -.macro KERNEL16x1_4 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 
- VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - addq $4, BI - addq $64, %rax -.endm - -.macro KERNEL16x1_SUB - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm10,%ymm1,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm13,%ymm1,%ymm0 - addq $1, BI - addq $16, %rax -.endm - -.macro SAVE16x1 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm13, %ymm13 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 - vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - vmovups %ymm10, 8 * SIZE(CO1) - vmovups %ymm13,12 * SIZE(CO1) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x1_1 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 -.endm - -.macro KERNEL8x1_2 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 -.endm - -.macro KERNEL8x1_3 - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 -.endm - -.macro KERNEL8x1_4 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - addq $4, BI - addq $32, %rax -.endm - -.macro KERNEL8x1_SUB - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm7,%ymm1,%ymm0 - addq $1, BI - addq $8 , %rax -.endm - -.macro SAVE8x1 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm7 , 4 * SIZE(CO1) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x1_1 - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 -.endm - -.macro KERNEL4x1_2 - vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 -.endm - -.macro KERNEL4x1_3 - vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 -.endm - -.macro KERNEL4x1_4 - vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - addq $4, BI - addq $16, %rax -.endm - -.macro KERNEL4x1_SUB - vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 - vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 - VFMADD231PD_ %ymm4,%ymm1,%ymm0 - addq $1, BI - addq $4 
, %rax -.endm - -.macro SAVE4x1 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4,%ymm4 - -#endif - - vmovups %ymm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x1_1 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 -.endm - -.macro KERNEL2x1_2 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 -.endm - -.macro KERNEL2x1_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 -.endm - -.macro KERNEL2x1_4 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - addq $4, BI - addq $8, %rax -.endm - -.macro KERNEL2x1_SUB - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm8,%xmm1,%xmm0 - addq $1, BI - addq $2 , %rax -.endm - -.macro SAVE2x1 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm8 , %xmm8 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm8 , 1 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x1_1 - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 -.endm - -.macro KERNEL1x1_2 - vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 -.endm - -.macro KERNEL1x1_3 - vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 -.endm - -.macro KERNEL1x1_4 - vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - addq $ 4, BI - addq $ 4, %rax -.endm - -.macro KERNEL1x1_SUB - vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 - vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 - VFMADD231SD_ %xmm4,%xmm1,%xmm0 - addq $ 1, BI - addq $ 1 , %rax -.endm - -.macro SAVE1x1 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4,%xmm4 - -#endif - - vmovsd %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq 
OLD_C, C - movq OLD_LDC, LDC - - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -.L6_01: - // copy to sub buffer - movq K, %rax - salq $1,%rax // K * 2 ; read 2 values - movq B, BO1 - leaq (B,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_01a_2 - ALIGN_4 - -.L6_01a_1: - - prefetcht0 512(BO1) - prefetcht0 512(BO2) - prefetchw 512(BO) - - - vmovups 0 * SIZE(BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm2 - vmovups 4 * SIZE(BO1), %xmm4 - vmovups 6 * SIZE(BO1), %xmm6 - vmovsd 0 * SIZE(BO2), %xmm1 - vmovsd 2 * SIZE(BO2), %xmm3 - vmovsd 4 * SIZE(BO2), %xmm5 - vmovsd 6 * SIZE(BO2), %xmm7 - vmovups %xmm0, 0*SIZE(BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovups %xmm2, 3*SIZE(BO) - vmovsd %xmm3, 5*SIZE(BO) - vmovups %xmm4, 6*SIZE(BO) - vmovsd %xmm5, 8*SIZE(BO) - vmovups %xmm6, 9*SIZE(BO) - vmovsd %xmm7,11*SIZE(BO) - addq $ 8*SIZE,BO1 - addq $ 8*SIZE,BO2 - addq $ 12*SIZE,BO - - vmovups 0 * SIZE(BO1), %xmm0 - vmovups 2 * SIZE(BO1), %xmm2 - vmovups 4 * SIZE(BO1), %xmm4 - vmovups 6 * SIZE(BO1), %xmm6 - vmovsd 0 * SIZE(BO2), %xmm1 - vmovsd 2 * SIZE(BO2), %xmm3 - vmovsd 4 * SIZE(BO2), %xmm5 - vmovsd 6 * SIZE(BO2), %xmm7 - vmovups %xmm0, 0*SIZE(BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovups %xmm2, 3*SIZE(BO) - vmovsd %xmm3, 5*SIZE(BO) - vmovups %xmm4, 6*SIZE(BO) - vmovsd %xmm5, 8*SIZE(BO) - vmovups %xmm6, 9*SIZE(BO) - vmovsd %xmm7,11*SIZE(BO) - addq $ 8*SIZE,BO1 - addq $ 8*SIZE,BO2 - addq $ 12*SIZE,BO - - decq %rax - jnz .L6_01a_1 - - - -.L6_01a_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_02c - ALIGN_4 - - -.L6_02b: - - vmovups 0 * SIZE(BO1), %xmm0 - vmovsd 0 * SIZE(BO2), %xmm2 - vmovups %xmm0, 0*SIZE(BO) - vmovsd %xmm2, 2*SIZE(BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO2 - addq $ 3*SIZE,BO - decq %rax - jnz .L6_02b - -.L6_02c: - - movq K, %rax - salq $1,%rax // K * 2 - leaq (B,%rax, SIZE), BO1 // next offset to BO1 - leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER2, BO // second buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_02c_2 - ALIGN_4 - -.L6_02c_1: - - prefetcht0 512(BO2) - prefetchw 512(BO) - - vmovups 0 * SIZE(BO2), %xmm0 - vmovups 2 * SIZE(BO2), %xmm2 - vmovups 4 * SIZE(BO2), %xmm4 - vmovups 6 * SIZE(BO2), %xmm6 - vmovsd 1 * SIZE(BO1), %xmm1 - vmovsd 3 * SIZE(BO1), %xmm3 - vmovsd 5 * SIZE(BO1), %xmm5 - vmovsd 7 * SIZE(BO1), %xmm7 - vmovsd %xmm1, 0*SIZE(BO) - vmovups %xmm0, 1*SIZE(BO) - vmovsd %xmm3, 3*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovsd %xmm5, 6*SIZE(BO) - vmovups %xmm4, 7*SIZE(BO) - vmovsd %xmm7, 9*SIZE(BO) - vmovups %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - - vmovups 0 * SIZE(BO2), %xmm0 - vmovups 2 * SIZE(BO2), %xmm2 - vmovups 4 * SIZE(BO2), %xmm4 - vmovups 6 * SIZE(BO2), %xmm6 - vmovsd 1 * SIZE(BO1), %xmm1 - vmovsd 3 * SIZE(BO1), %xmm3 - vmovsd 5 * SIZE(BO1), %xmm5 - vmovsd 7 * SIZE(BO1), %xmm7 - vmovsd %xmm1, 0*SIZE(BO) - vmovups %xmm0, 1*SIZE(BO) - vmovsd %xmm3, 3*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovsd %xmm5, 6*SIZE(BO) - 
vmovups %xmm4, 7*SIZE(BO) - vmovsd %xmm7, 9*SIZE(BO) - vmovups %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - decq %rax - jnz .L6_02c_1 - - -.L6_02c_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_03c - ALIGN_4 - -.L6_03b: - - vmovsd 1*SIZE(BO1), %xmm0 - vmovups 0*SIZE(BO2), %xmm1 - vmovsd %xmm0, 0*SIZE(BO) - vmovups %xmm1, 1*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_03b - - -.L6_03c: - - movq BO2, B // next offset of B - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L6_20 - - ALIGN_4 - -.L6_11: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - prefetcht0 (CO1) - prefetcht0 (CO1,LDC,1) - prefetcht0 (CO1,LDC,2) - prefetcht0 64(CO1) - prefetcht0 64(CO1,LDC,1) - prefetcht0 64(CO1,LDC,2) - - vzeroall - - movq K, %rax - - sarq $1, %rax // K / 8 - je .L6_16 - - ALIGN_5 - -.L6_12: -/* - prefetcht0 B_PR1(BO) - prefetcht0 B_PR1+64(BO) - prefetcht0 B_PR1+128(BO) -*/ - KERNEL16x3_SUBN - KERNEL16x3_SUBN -/* - KERNEL16x3_SUBN - KERNEL16x3_SUBN - - KERNEL16x3_SUBN - KERNEL16x3_SUBN - KERNEL16x3_SUBN - KERNEL16x3_SUBN -*/ - dec %rax - jne .L6_12 - -.L6_16: - movq K, %rax - - andq $1, %rax # if (k & 1) - je .L6_19 - - ALIGN_4 - -.L6_17: - - KERNEL16x3_SUBN - - dec %rax - jne .L6_17 - ALIGN_4 - - -.L6_19: - - SAVE16x3 - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L6_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $15, M - jz .L7_10 // to next 3 lines of N - - testq $8, M - jz .L6_21pre - ALIGN_4 - -/**************************************************************************/ - -.L6_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L6_20_6 - - ALIGN_4 - -.L6_20_2: - - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - dec %rax - jne .L6_20_2 - ALIGN_4 - -.L6_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_20_9 - - - ALIGN_4 - -.L6_20_7: - - KERNEL8x3_SUBN - - dec %rax - jne .L6_20_7 - ALIGN_4 - - -.L6_20_9: - - SAVE8x3 - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L6_21pre: - - testq $4, M - jz .L6_30 - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L6_26 - - ALIGN_4 - -.L6_22: - - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - dec %rax - jne .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - ALIGN_4 - -.L6_27: - - KERNEL4x3_SUBN - - dec %rax - jne .L6_27 - ALIGN_4 - - -.L6_29: - - SAVE4x3 - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L6_36 - ALIGN_4 - -.L6_32: - - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - dec %rax - jne .L6_32 - ALIGN_4 - 
-.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - ALIGN_4 - -.L6_37: - - KERNEL2x3_SUBN - - dec %rax - jne .L6_37 - ALIGN_4 - - -.L6_39: - - SAVE2x3 - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L7_10 // to next 3 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3,%rax - je .L6_46 - - ALIGN_4 - -.L6_42: - - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - - dec %rax - jne .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - ALIGN_4 - -.L6_47: - - KERNEL1x3_SUBN - - dec %rax - jne .L6_47 - ALIGN_4 - - -.L6_49: - - SAVE1x3 - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - -/***************************************************************************************************************/ - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L7_20 - - ALIGN_4 - -.L7_11: - leaq BUFFER2, BO // second buffer to BO - addq $12 * SIZE, BO - - prefetcht0 (CO1) - prefetcht0 (CO1,LDC,1) - prefetcht0 (CO1,LDC,2) - prefetcht0 64(CO1) - prefetcht0 64(CO1,LDC,1) - prefetcht0 64(CO1,LDC,2) - - vzeroall - - movq K, %rax - - sarq $3, %rax // K / 8 - je .L7_16 - ALIGN_5 - -.L7_12: -/* - prefetcht0 B_PR1(BO) - prefetcht0 B_PR1+64(BO) - prefetcht0 B_PR1+128(BO) -*/ - KERNEL16x3_SUBN - KERNEL16x3_SUBN - KERNEL16x3_SUBN - KERNEL16x3_SUBN - - KERNEL16x3_SUBN - KERNEL16x3_SUBN - KERNEL16x3_SUBN - KERNEL16x3_SUBN - dec %rax - jne .L7_12 - ALIGN_4 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - ALIGN_5 - -.L7_17: - - KERNEL16x3_SUBN - - dec %rax - jne .L7_17 - - -.L7_19: - - SAVE16x3 - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L7_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L7_20: - // Test rest of M - - testq $15, M - jz .L7_60 // to next 3 lines of N - - testq $8, M - jz .L7_21pre - ALIGN_4 - -/**************************************************************************/ - -.L7_20_1: - leaq BUFFER2, BO // first buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L7_20_6 - - ALIGN_4 - -.L7_20_2: - - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - KERNEL8x3_SUBN - - dec %rax - jne .L7_20_2 - ALIGN_4 - -.L7_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_20_9 - - ALIGN_4 - -.L7_20_7: - - KERNEL8x3_SUBN - - dec %rax - jne .L7_20_7 - ALIGN_4 - -.L7_20_9: - - SAVE8x3 - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L7_21pre: - - testq $4, M - jz .L7_30 - ALIGN_4 - -.L7_21: - leaq BUFFER2, BO // second buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L7_26 - - ALIGN_4 - -.L7_22: - - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - KERNEL4x3_SUBN - - dec %rax - jne .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - ALIGN_4 - -.L7_27: - - KERNEL4x3_SUBN - - dec %rax - jne .L7_27 - ALIGN_4 - 
- -.L7_29: - - SAVE4x3 - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER2, BO // second buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L7_36 - - ALIGN_4 - -.L7_32: - - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - KERNEL2x3_SUBN - - dec %rax - jne .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - ALIGN_4 - -.L7_37: - - KERNEL2x3_SUBN - - dec %rax - jne .L7_37 - ALIGN_4 - - -.L7_39: - - SAVE2x3 - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L7_40: - testq $1, M - jz .L7_60 // to next 3 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER2, BO // second buffer to BO - addq $12 * SIZE, BO - - vzeroall - - movq K, %rax - - sarq $3, %rax - je .L7_46 - - ALIGN_4 - -.L7_42: - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - KERNEL1x3_SUBN - - dec %rax - jne .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - ALIGN_4 - -.L7_47: - - KERNEL1x3_SUBN - - dec %rax - jne .L7_47 - ALIGN_4 - - -.L7_49: - - SAVE1x3 - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L7_60: - - decq J // j -- - jg .L6_01 - - -.L2_0: - cmpq $0, Nmod6 // N % 6 == 0 - je .L999 - -/************************************************************************************************ -* Loop for Nmod6 / 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - sarq $1, J // j = j / 2 - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 2*SIZE(BO1), %xmm1 - vmovups 4*SIZE(BO1), %xmm2 - vmovups 6*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 2*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovups %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // 
BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 3 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - je .L2_20_6 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax 
*2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - je .L2_36 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - je .L2_46 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - -.L2_60: - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number 
of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - je .L1_20_6 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - je .L1_36 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, 
BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - je .L1_46 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - 
prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 2*SIZE(BO1), %xmm1 - vmovups 4*SIZE(BO1), %xmm2 - vmovups 6*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 2*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovups %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x2_1 - KERNEL16x2_2 - KERNEL16x2_3 - KERNEL16x2_4 - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - 
- testq $15, M - jz .L2_60 // to next 3 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - je .L2_20_6 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1 - KERNEL8x2_2 - KERNEL8x2_3 - KERNEL8x2_4 - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - 
addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_3 - KERNEL4x2_4 - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - je .L2_36 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_3 - KERNEL2x2_4 - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - -#if 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - je .L2_46 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_3 - KERNEL1x2_4 - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // 
aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - KERNEL16x1_1 - KERNEL16x1_2 - KERNEL16x1_3 - KERNEL16x1_4 - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO 
- - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - je .L1_20_6 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - KERNEL8x1_1 - KERNEL8x1_2 - KERNEL8x1_3 - KERNEL8x1_4 - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_3 - KERNEL4x1_4 - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK 
-#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - je .L1_36 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_3 - KERNEL2x1_4 - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - je .L1_46 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_3 - KERNEL1x1_4 - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax 
# if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - - - -#endif +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +/********************************************************************* +* 2013/10/20 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK + +* +* +* 2013/10/20 Saar +* Parameter: +* DGEMM_DEFAULT_UNROLL_N 2 +* DGEMM_DEFAULT_UNROLL_M 16 +* DGEMM_DEFAULT_P 192 +* DGEMM_DEFAULT_Q 128 +* A_PR1 512 +* +* +* Performance without prefetch of B: +* 1 thread: 45.8 GFLOPS (MKL: 45) +* 2 threads: 80.0 GFLOPS (MKL: 91) +* 4 threads: 135.0 GFLOPS (MKL: 135) +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 512*8*4 +#define LB2_OFFSET 512*8*2 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +.macro VFMADD231PD_ y0,y1,y2 + vfmaddpd \y0,\y1,\y2,\y0 +.endm + +.macro VFMADD231SD_ x0,x1,x2 + vfmaddsd \x0,\x1,\x2,\x0 +.endm + +#else + +.macro VFMADD231PD_ y0,y1,y2 + vfmadd231pd \y2,\y1,\y0 +.endm + +.macro VFMADD231SD_ x0,x1,x2 + vfmadd231sd \x2,\x1,\x0 +.endm + +#endif + + +#define A_PR1 512 +#define B_PR1 256 + +/******************************************************************************************* +* 3 lines of N +*******************************************************************************************/ + +.macro KERNEL16x3_SUBN + prefetcht0 A_PR1(AO) + vbroadcastsd -12 * SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovaps -12 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 A_PR1+64(AO) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovaps -8 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovaps -4 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 + addq $ 3*SIZE , BO + addq $ 16*SIZE, AO +.endm + + +.macro KERNEL8x3_SUBN + //prefetcht0 A_PR1(AO) + vbroadcastsd -12 * SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + 
VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovaps -12 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + //prefetcht0 A_PR1+64(AO) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + prefetcht0 B_PR1(BO) + addq $ 3*SIZE , BO + addq $ 8*SIZE, AO +.endm + +.macro KERNEL4x3_SUBN + vbroadcastsd -12 * SIZE(BO), %ymm1 + vmovaps -16 * SIZE(AO), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -10 * SIZE(BO), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $ 3*SIZE , BO + addq $ 4*SIZE, AO +.endm + +.macro KERNEL2x3_SUBN + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -10 * SIZE(BO), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -15 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $ 3*SIZE , BO + addq $ 2*SIZE, AO +.endm + +.macro KERNEL1x3_SUBN + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -10 * SIZE(BO), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $ 3*SIZE , BO + addq $ 1*SIZE, AO +.endm + + + + + + +/******************************************************************************************/ + +.macro KERNEL16x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + + + + +.macro KERNEL16x3_2 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + prefetcht0 A_PR1+64(AO,%rax,SIZE) + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + +.macro KERNEL16x3_3 + prefetcht0 256+A_PR1(AO, %rax, SIZE) + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ 
%ymm6,%ymm3,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 320+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + +.macro KERNEL16x3_4 + prefetcht0 384+A_PR1(AO, %rax, SIZE) + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 448+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + addq $12, BI + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $64, %rax + VFMADD231PD_ %ymm15,%ymm3,%ymm0 +.endm + +.macro KERNEL16x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + VFMADD231PD_ %ymm12,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + VFMADD231PD_ %ymm15,%ymm3,%ymm0 + addq $3 , BI + addq $16, %rax +.endm + +.macro SAVE16x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm11, %ymm11 + vmulpd %ymm0 , %ymm14, %ymm14 + + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm12, %ymm12 + vmulpd %ymm0 , %ymm15, %ymm15 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 + vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 + + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 + vaddpd 8 * SIZE(CO1, LDC, 2), %ymm12,%ymm12 + vaddpd 12 * SIZE(CO1, LDC, 2), %ymm15,%ymm15 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + vmovups %ymm11, 8 * SIZE(CO1, LDC) + vmovups %ymm14,12 * SIZE(CO1, LDC) + + vmovups %ymm6 , (CO1, LDC, 2) + vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %ymm12, 8 * SIZE(CO1, LDC, 2) + vmovups %ymm15,12 * SIZE(CO1, LDC, 2) + +.endm + + + 
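For orientation: the KERNEL16x3_* and SAVE16x3 macros above form the 16x3 register-blocked micro-kernel. Per k step, three values of B are broadcast and multiplied into four 4-wide vectors of A with FMA, accumulating into ymm4..ymm15; on save, the twelve accumulators are scaled by ALPHA and added into three columns of C. The scalar C sketch below models the same update; it is illustrative only (the function name, packing layout and the tiny driver in main are assumptions of this sketch, not code from the patch).

    /* Rough scalar model of the 16x3 micro-kernel above (illustrative only,
     * not part of the patch).  A is packed 16 rows per k step, B is packed
     * 3 columns per k step, C is column-major with leading dimension ldc.
     * acc[][] plays the role of the ymm4..ymm15 accumulators. */
    #include <stdio.h>

    static void dgemm_tile_16x3(long K, double alpha, const double *A,
                                const double *B, double *C, long ldc)
    {
        double acc[16][3] = {{0.0}};
        for (long k = 0; k < K; k++)               /* one KERNEL16x3_SUBN per k    */
            for (int j = 0; j < 3; j++) {          /* vbroadcastsd of one B value  */
                double b = B[3 * k + j];
                for (int i = 0; i < 16; i++)       /* four vector loads of A + FMA */
                    acc[i][j] += A[16 * k + i] * b;
            }
        for (int j = 0; j < 3; j++)                /* SAVE16x3: C += alpha * acc   */
            for (int i = 0; i < 16; i++)
                C[i + j * ldc] += alpha * acc[i][j];
    }

    int main(void)
    {
        double A[16 * 2], B[3 * 2], C[16 * 3] = {0.0};
        for (int i = 0; i < 32; i++) A[i] = 1.0;
        for (int i = 0; i < 6;  i++) B[i] = 1.0;
        dgemm_tile_16x3(2, 1.0, A, B, C, 16);
        printf("%f\n", C[0]);                      /* prints 2.000000 */
        return 0;
    }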
+/*******************************************************************************************/ + +.macro KERNEL8x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_2 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_3 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 +.endm + +.macro KERNEL8x3_4 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + addq $12, BI + addq $32, %rax +.endm + +.macro KERNEL8x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + VFMADD231PD_ %ymm9,%ymm3,%ymm0 + addq $3 , BI + addq $8 , %rax +.endm + +.macro SAVE8x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm9 , %ymm9 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + + vmovups %ymm6 , (CO1, LDC, 2) + vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ 
%ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_2 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_3 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 +.endm + +.macro KERNEL4x3_4 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $12, BI + addq $16, %rax +.endm + +.macro KERNEL4x3_SUB + vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 + VFMADD231PD_ %ymm6,%ymm3,%ymm0 + addq $3 , BI + addq $4 , %rax +.endm + +.macro SAVE4x3 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd (CO1, LDC, 2), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (CO1, LDC, 2) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x3_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_2 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 +.endm + +.macro KERNEL2x3_4 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 5 * 
SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $12, BI + addq $8, %rax +.endm + +.macro KERNEL2x3_SUB + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + VFMADD231SD_ %xmm12,%xmm3,%xmm0 + addq $3 , BI + addq $2 , %rax +.endm + +.macro SAVE2x3 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm12, %xmm12 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 + vaddsd (CO1, LDC, 2), %xmm6,%xmm6 + vaddsd 1 * SIZE(CO1, LDC, 2), %xmm12,%xmm12 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm10, 1 * SIZE(CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + vmovsd %xmm12, 1 * SIZE(CO1, LDC, 2) + +.endm + +/*******************************************************************************************/ + +.macro KERNEL1x3_1 + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_2 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 +.endm + +.macro KERNEL1x3_4 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $12, BI + addq $4, %rax +.endm + +.macro KERNEL1x3_SUB + vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 + VFMADD231SD_ %xmm6,%xmm3,%xmm0 + addq $3 , BI + addq $1 , %rax +.endm + +.macro SAVE1x3 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd (CO1, LDC, 2), %xmm6,%xmm6 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + +.endm + + +/*******************************************************************************************/ + 
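The 8x3/4x3/2x3/1x3 macro groups above cover the tail rows when M is not a multiple of 16; the driver further down peels full 16-row tiles first and then tests the low bits of M (8, 4, 2, 1) to pick a narrower tile for the remainder. The short C model below sketches that dispatch order only; the function names are placeholders and the code is not part of the patch.

    /* Illustrative model of how the driver walks M for one 3-column panel:
     * full 16-row tiles first, then the 8/4/2/1-row tails, mirroring the
     * "testq $8/$4/$2/$1, M" cascade below.  Placeholder names only. */
    #include <stdio.h>

    static void tile(int rows) { printf("tile %dx3\n", rows); }

    static void walk_m(long M)
    {
        for (long i = M >> 4; i > 0; i--) tile(16);  /* sarq $4, I loop */
        if (M & 8) tile(8);                          /* testq $8, M     */
        if (M & 4) tile(4);                          /* testq $4, M     */
        if (M & 2) tile(2);                          /* testq $2, M     */
        if (M & 1) tile(1);                          /* testq $1, M     */
    }

    int main(void) { walk_m(29); return 0; }         /* 16 + 8 + 4 + 1  */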
+/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_2 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_3 + prefetcht0 256+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 320+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 +.endm + +.macro KERNEL16x2_4 + prefetcht0 384+A_PR1(AO, %rax, SIZE) + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + prefetcht0 448+A_PR1(AO, %rax, SIZE) + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $8, BI + addq $64, %rax +.endm + +.macro KERNEL16x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + VFMADD231PD_ %ymm11,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + VFMADD231PD_ %ymm14,%ymm2,%ymm0 + addq $2, BI + addq $16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , 
%ymm13, %ymm13 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm11, %ymm11 + vmulpd %ymm0 , %ymm14, %ymm14 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 + vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + vmovups %ymm11, 8 * SIZE(CO1, LDC) + vmovups %ymm14,12 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_2 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_3 + prefetcht0 128+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 +.endm + +.macro KERNEL8x2_4 + prefetcht0 192+A_PR1(AO, %rax, SIZE) + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + addq $8, BI + addq $32, %rax +.endm + +.macro KERNEL8x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + VFMADD231PD_ %ymm8,%ymm2,%ymm0 + addq $2, BI + addq $8 , %rax +.endm + +.macro SAVE8x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm8 , %ymm8 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + + vaddpd (CO1, LDC), %ymm5,%ymm5 + vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm8 , 4 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + 
vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_2 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_3 + prefetcht0 64+A_PR1(AO, %rax, SIZE) + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 +.endm + +.macro KERNEL4x2_4 + vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + addq $8, BI + addq $16, %rax +.endm + +.macro KERNEL4x2_SUB + vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 + VFMADD231PD_ %ymm5,%ymm2,%ymm0 + addq $2, BI + addq $4 , %rax +.endm + +.macro SAVE4x2 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd (CO1, LDC), %ymm5,%ymm5 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x2_1 + prefetcht0 A_PR1(AO, %rax, SIZE) + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_2 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 +.endm + +.macro KERNEL2x2_4 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + addq $8, BI + addq $8, %rax +.endm + +.macro KERNEL2x2_SUB + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + VFMADD231SD_ %xmm10,%xmm2,%xmm0 + addq $2, BI + addq $2, %rax +.endm + +.macro SAVE2x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm10, %xmm10 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + vaddsd (CO1, LDC), %xmm5,%xmm5 + vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 
+ +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm10, 1 * SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x2_1 + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_2 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 +.endm + +.macro KERNEL1x2_4 + vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + addq $8, BI + addq $4, %rax +.endm + +.macro KERNEL1x2_SUB + vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 + VFMADD231SD_ %xmm5,%xmm2,%xmm0 + addq $2, BI + addq $1, %rax +.endm + +.macro SAVE1x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd (CO1, LDC), %xmm5,%xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +.macro KERNEL16x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 +.endm + +.macro KERNEL16x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + addq $4, BI + addq $64, %rax +.endm + +.macro KERNEL16x1_SUB + 
vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm10,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm13,%ymm1,%ymm0 + addq $1, BI + addq $16, %rax +.endm + +.macro SAVE16x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm13, %ymm13 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 + vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + vmovups %ymm10, 8 * SIZE(CO1) + vmovups %ymm13,12 * SIZE(CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 +.endm + +.macro KERNEL8x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + addq $4, BI + addq $32, %rax +.endm + +.macro KERNEL8x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm7,%ymm1,%ymm0 + addq $1, BI + addq $8 , %rax +.endm + +.macro SAVE8x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm7 , 4 * SIZE(CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x1_1 + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_2 + vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_3 + vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 +.endm + +.macro KERNEL4x1_4 + vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + addq $4, BI + addq $16, %rax +.endm + +.macro KERNEL4x1_SUB + vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 + vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 + VFMADD231PD_ %ymm4,%ymm1,%ymm0 + addq $1, BI + addq $4 , %rax +.endm + +.macro SAVE4x1 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4,%ymm4 + +#endif + + vmovups 
%ymm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x1_1 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_2 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 +.endm + +.macro KERNEL2x1_4 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + addq $4, BI + addq $8, %rax +.endm + +.macro KERNEL2x1_SUB + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm8,%xmm1,%xmm0 + addq $1, BI + addq $2 , %rax +.endm + +.macro SAVE2x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm8 , %xmm8 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm8 , 1 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x1_1 + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_2 + vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_3 + vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 +.endm + +.macro KERNEL1x1_4 + vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + addq $ 4, BI + addq $ 4, %rax +.endm + +.macro KERNEL1x1_SUB + vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 + vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 + VFMADD231SD_ %xmm4,%xmm1,%xmm0 + addq $ 1, BI + addq $ 1 , %rax +.endm + +.macro SAVE1x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4,%xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + 
andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_01a_2 + ALIGN_4 + +.L6_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + + + vmovups 0 * SIZE(BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm2 + vmovups 4 * SIZE(BO1), %xmm4 + vmovups 6 * SIZE(BO1), %xmm6 + vmovsd 0 * SIZE(BO2), %xmm1 + vmovsd 2 * SIZE(BO2), %xmm3 + vmovsd 4 * SIZE(BO2), %xmm5 + vmovsd 6 * SIZE(BO2), %xmm7 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + + vmovups 0 * SIZE(BO1), %xmm0 + vmovups 2 * SIZE(BO1), %xmm2 + vmovups 4 * SIZE(BO1), %xmm4 + vmovups 6 * SIZE(BO1), %xmm6 + vmovsd 0 * SIZE(BO2), %xmm1 + vmovsd 2 * SIZE(BO2), %xmm3 + vmovsd 4 * SIZE(BO2), %xmm5 + vmovsd 6 * SIZE(BO2), %xmm7 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + + decq %rax + jnz .L6_01a_1 + + + +.L6_01a_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_02c + ALIGN_4 + + +.L6_02b: + + vmovups 0 * SIZE(BO1), %xmm0 + vmovsd 0 * SIZE(BO2), %xmm2 + vmovups %xmm0, 0*SIZE(BO) + vmovsd %xmm2, 2*SIZE(BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO2 + addq $ 3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax, SIZE), BO1 // next offset to BO1 + leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_02c_2 + ALIGN_4 + +.L6_02c_1: + + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovups 0 * SIZE(BO2), %xmm0 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups 4 * SIZE(BO2), %xmm4 + vmovups 6 * SIZE(BO2), %xmm6 + vmovsd 1 * SIZE(BO1), %xmm1 + vmovsd 3 * SIZE(BO1), %xmm3 + vmovsd 5 * SIZE(BO1), %xmm5 + vmovsd 7 * SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + + vmovups 0 * SIZE(BO2), %xmm0 + vmovups 2 * SIZE(BO2), %xmm2 + vmovups 4 * SIZE(BO2), %xmm4 + vmovups 6 * SIZE(BO2), %xmm6 + vmovsd 1 * SIZE(BO1), %xmm1 + vmovsd 3 * SIZE(BO1), %xmm3 + vmovsd 5 * SIZE(BO1), %xmm5 + vmovsd 7 * SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_02c_1 + + 
+.L6_02c_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_03c + ALIGN_4 + +.L6_03b: + + vmovsd 1*SIZE(BO1), %xmm0 + vmovups 0*SIZE(BO2), %xmm1 + vmovsd %xmm0, 0*SIZE(BO) + vmovups %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + prefetcht0 (CO1) + prefetcht0 (CO1,LDC,1) + prefetcht0 (CO1,LDC,2) + prefetcht0 64(CO1) + prefetcht0 64(CO1,LDC,1) + prefetcht0 64(CO1,LDC,2) + + vzeroall + + movq K, %rax + + sarq $1, %rax // K / 8 + je .L6_16 + + ALIGN_5 + +.L6_12: +/* + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) +*/ + KERNEL16x3_SUBN + KERNEL16x3_SUBN +/* + KERNEL16x3_SUBN + KERNEL16x3_SUBN + + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN +*/ + dec %rax + jne .L6_12 + +.L6_16: + movq K, %rax + + andq $1, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL16x3_SUBN + + dec %rax + jne .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE16x3 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L7_10 // to next 3 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_20_6 + + ALIGN_4 + +.L6_20_2: + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + dec %rax + jne .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + + ALIGN_4 + +.L6_20_7: + + KERNEL8x3_SUBN + + dec %rax + jne .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + SAVE8x3 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_26 + + ALIGN_4 + +.L6_22: + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + dec %rax + jne .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUBN + + dec %rax + jne .L6_27 + ALIGN_4 + + +.L6_29: + + SAVE4x3 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L6_36 + ALIGN_4 + +.L6_32: + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + dec %rax + jne .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUBN + + dec %rax + jne .L6_37 + ALIGN_4 + + +.L6_39: + + SAVE2x3 + 
+ addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3,%rax + je .L6_46 + + ALIGN_4 + +.L6_42: + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + dec %rax + jne .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUBN + + dec %rax + jne .L6_47 + ALIGN_4 + + +.L6_49: + + SAVE1x3 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + prefetcht0 (CO1) + prefetcht0 (CO1,LDC,1) + prefetcht0 (CO1,LDC,2) + prefetcht0 64(CO1) + prefetcht0 64(CO1,LDC,1) + prefetcht0 64(CO1,LDC,2) + + vzeroall + + movq K, %rax + + sarq $3, %rax // K / 8 + je .L7_16 + ALIGN_5 + +.L7_12: +/* + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) +*/ + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + KERNEL16x3_SUBN + dec %rax + jne .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + ALIGN_5 + +.L7_17: + + KERNEL16x3_SUBN + + dec %rax + jne .L7_17 + + +.L7_19: + + SAVE16x3 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 3 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER2, BO // first buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_20_6 + + ALIGN_4 + +.L7_20_2: + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + KERNEL8x3_SUBN + + dec %rax + jne .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + ALIGN_4 + +.L7_20_7: + + KERNEL8x3_SUBN + + dec %rax + jne .L7_20_7 + ALIGN_4 + +.L7_20_9: + + SAVE8x3 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_26 + + ALIGN_4 + +.L7_22: + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + KERNEL4x3_SUBN + + dec %rax + jne .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUBN + + dec %rax + jne .L7_27 + ALIGN_4 + + +.L7_29: + + SAVE4x3 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to 
BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_36 + + ALIGN_4 + +.L7_32: + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + KERNEL2x3_SUBN + + dec %rax + jne .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUBN + + dec %rax + jne .L7_37 + ALIGN_4 + + +.L7_39: + + SAVE2x3 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 3 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $12 * SIZE, BO + + vzeroall + + movq K, %rax + + sarq $3, %rax + je .L7_46 + + ALIGN_4 + +.L7_42: + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + KERNEL1x3_SUBN + + dec %rax + jne .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUBN + + dec %rax + jne .L7_47 + ALIGN_4 + + +.L7_49: + + SAVE1x3 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm1 + vmovups 4*SIZE(BO1), %xmm2 + vmovups 6*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovups %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 
+ +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + 
KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + 
+ SAVE16x1 + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + 
+.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm1 + vmovups 4*SIZE(BO1), %xmm2 + vmovups 6*SIZE(BO1), %xmm3 + + vmovups 
%xmm0, (BO) + vmovups %xmm1, 2*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovups %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x2_1 + KERNEL16x2_2 + KERNEL16x2_3 + KERNEL16x2_4 + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + 
+/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_3 + KERNEL8x2_4 + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + 
je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_3 + KERNEL4x2_4 + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_3 + KERNEL2x2_4 + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_3 + KERNEL1x2_4 + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + 
+ ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + KERNEL16x1_1 + KERNEL16x1_2 + KERNEL16x1_3 + KERNEL16x1_4 + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq 
(BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_3 + KERNEL8x1_4 + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_3 + KERNEL4x1_4 + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz 
.L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_3 + KERNEL2x1_4 + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_3 + KERNEL1x1_4 + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO 
+ leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/dgemm_kernel_4x4_haswell.S b/kernel/x86_64/dgemm_kernel_4x4_haswell.S index 0a2ca7ae3..29501df8e 100644 --- a/kernel/x86_64/dgemm_kernel_4x4_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x4_haswell.S @@ -1,3494 +1,3494 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************************/ - - -/********************************************************************* -* 2013/10/28 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK - -* -* -* 2013/10/27 Saar -* Parameter: -* DGEMM_DEFAULT_UNROLL_N 4 -* DGEMM_DEFAULT_UNROLL_M 4 -* DGEMM_DEFAULT_P 512 -* DGEMM_DEFAULT_Q 256 -* A_PR1 512 -* B_PR1 512 -* -* -* Performance at 9216x9216x9216: -* 1 thread: 53.3 GFLOPS (MKL: 54) -* 2 threads: 100.0 GFLOPS (MKL: 97) -* 3 threads: 147.0 GFLOPS (MKL: 133) -* 4 threads: 184.0 GFLOPS (MKL: 170) -*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 -#define BO3 %rbp - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 -#define L_BUFFER_SIZE 256*8*12+4096 - -#else - -#define STACKSIZE 256 -#define L_BUFFER_SIZE 128*8*12+512 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - - -#define Ndiv12 24(%rsp) -#define Nmod12 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $ 0, 4096 * 4(%rsp);\ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $ 0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#define A_PR1 512 -#define B_PR1 512 - -/******************************************************************************************* -* Macro definitions -*******************************************************************************************/ - -.macro INIT4x12 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - vxorpd %ymm8 , %ymm8 , %ymm8 - vxorpd %ymm9 , %ymm9 , %ymm9 - vxorpd %ymm10, %ymm10, %ymm10 - vxorpd %ymm11, %ymm11, %ymm11 - vxorpd %ymm12, %ymm12, %ymm12 - vxorpd %ymm13, %ymm13, %ymm13 - vxorpd %ymm14, %ymm14, %ymm14 - vxorpd %ymm15, %ymm15, %ymm15 - -.endm - -.macro KERNEL4x12_I - prefetcht0 A_PR1(AO) - vmovups -12 * SIZE(BO), %ymm1 - prefetcht0 B_PR1(BO) - vmovups -16 * SIZE(AO), %ymm0 - prefetcht0 B_PR1+64(BO) - vmovups -8 * SIZE(BO), %ymm2 - prefetcht0 B_PR1+128(BO) - vmovups -4 * SIZE(BO), %ymm3 - vmulpd %ymm0 ,%ymm1 , %ymm4 - prefetcht0 B_PR1+192(BO) - vmulpd %ymm0 ,%ymm2 , %ymm8 - vmulpd %ymm0 ,%ymm3 , %ymm12 - prefetcht0 B_PR1+256(BO) - vpermpd $ 0xb1, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm5 - vmulpd %ymm0 ,%ymm2 , %ymm9 - vmulpd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm6 - vmulpd %ymm0 ,%ymm2 , %ymm10 - - addq $ 12*SIZE, BO - vmulpd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * 
SIZE(BO), %ymm1 - vmulpd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - vmulpd %ymm0 ,%ymm3 , %ymm15 - vmovups -4 * SIZE(BO), %ymm3 - -.endm - -.macro KERNEL4x12_M1 - prefetcht0 A_PR1(AO) - vmovups -16 * SIZE(AO), %ymm0 - prefetcht0 B_PR1(BO) - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - prefetcht0 B_PR1+64(BO) - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - prefetcht0 B_PR1+128(BO) - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - vmovups -4 * SIZE(BO), %ymm3 - -.endm - -.macro KERNEL4x12_M2 - vmovups -12 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups 0 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups 4 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - vmovups 8 * SIZE(BO), %ymm3 - addq $ 24*SIZE, BO -.endm - - -.macro KERNEL4x12_E - vmovups -12 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - addq $ 12*SIZE, BO -.endm - -.macro KERNEL4x12_SUB - vmovups -12 * SIZE(BO), %ymm1 - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vmovups -8 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vmovups -4 * SIZE(BO), %ymm3 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - addq $ 12*SIZE, BO - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - addq $ 4*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - -.endm - - -.macro SAVE4x12 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - vmulpd %ymm0 , %ymm7 , %ymm7 - - vmulpd %ymm0 , %ymm8 , %ymm8 - vmulpd %ymm0 , %ymm9 , %ymm9 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm11, %ymm11 - - vmulpd %ymm0 , %ymm12, %ymm12 - vmulpd %ymm0 , %ymm13, %ymm13 - vmulpd %ymm0 , %ymm14, %ymm14 - vmulpd %ymm0 , %ymm15, %ymm15 - - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 - - vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 - vblendpd 
$ 0x05, %ymm5, %ymm4, %ymm1 - vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4, %ymm4 - vaddpd (CO1, LDC), %ymm5, %ymm5 - vaddpd (%rax), %ymm6, %ymm6 - vaddpd (%rax, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (%rax) - vmovups %ymm7 , (%rax, LDC) - - prefetcht0 32(CO1) - prefetcht0 32(CO1,LDC) - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - - vpermpd $ 0xb1 , %ymm9 , %ymm9 - vpermpd $ 0xb1 , %ymm11, %ymm11 - - vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 - vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 - vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 - vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %ymm4, %ymm4 - vaddpd (%rax, LDC), %ymm5, %ymm5 - vaddpd (%rbp), %ymm6, %ymm6 - vaddpd (%rbp, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (%rax) - vmovups %ymm5 , (%rax, LDC) - vmovups %ymm6 , (%rbp) - vmovups %ymm7 , (%rbp, LDC) - - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) - - vpermpd $ 0xb1 , %ymm13, %ymm13 - vpermpd $ 0xb1 , %ymm15, %ymm15 - - vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 - vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 - vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 - vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 - - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %ymm4, %ymm4 - vaddpd (%rax, LDC), %ymm5, %ymm5 - vaddpd (%rbp), %ymm6, %ymm6 - vaddpd (%rbp, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (%rax) - vmovups %ymm5 , (%rax, LDC) - vmovups %ymm6 , (%rbp) - vmovups %ymm7 , (%rbp, LDC) - - prefetcht0 32(%rax) - prefetcht0 32(%rax,LDC) - prefetcht0 32(%rbp) - prefetcht0 32(%rbp,LDC) - - addq $ 4*SIZE, CO1 -.endm - -/******************************************************************************************/ - -.macro INIT2x12 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - vxorpd %xmm12, %xmm12, %xmm12 - vxorpd %xmm13, %xmm13, %xmm13 - vxorpd %xmm14, %xmm14, %xmm14 - vxorpd %xmm15, %xmm15, %xmm15 - -.endm - -.macro KERNEL2x12_SUB - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -12 * SIZE(BO), %xmm1 - vmovddup -11 * SIZE(BO), %xmm2 - vmovddup -10 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm4 - vmovddup -9 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm5 - vmovddup -8 * 
SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - vmovddup -7 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm7 - vmovddup -6 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm8 - vmovddup -5 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm9 - vmovddup -4 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm10 - vmovddup -3 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm11 - vmovddup -2 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm12 - vmovddup -1 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm13 - addq $ 12*SIZE, BO - vfmadd231pd %xmm0 ,%xmm2 , %xmm14 - addq $ 2*SIZE, AO - vfmadd231pd %xmm0 ,%xmm3 , %xmm15 - -.endm - -.macro SAVE2x12 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - vmulpd %xmm0 , %xmm8 , %xmm8 - vmulpd %xmm0 , %xmm9 , %xmm9 - vmulpd %xmm0 , %xmm10, %xmm10 - vmulpd %xmm0 , %xmm11, %xmm11 - - vmulpd %xmm0 , %xmm12, %xmm12 - vmulpd %xmm0 , %xmm13, %xmm13 - vmulpd %xmm0 , %xmm14, %xmm14 - vmulpd %xmm0 , %xmm15, %xmm15 - - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm5, %xmm5 - vaddpd (%rax), %xmm6, %xmm6 - vaddpd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (%rax) - vmovups %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %xmm8 , %xmm4 - vaddpd (%rax, LDC), %xmm9 , %xmm5 - vaddpd (%rbp), %xmm10, %xmm6 - vaddpd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovups %xmm4 , (%rax) - vmovups %xmm5 , (%rax, LDC) - vmovups %xmm6 , (%rbp) - vmovups %xmm7 , (%rbp, LDC) - - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %xmm12, %xmm4 - vaddpd (%rax, LDC), %xmm13, %xmm5 - vaddpd (%rbp), %xmm14, %xmm6 - vaddpd (%rbp, LDC), %xmm15, %xmm7 - -#endif - - vmovups %xmm4 , (%rax) - vmovups %xmm5 , (%rax, LDC) - vmovups %xmm6 , (%rbp) - vmovups %xmm7 , (%rbp, LDC) - - addq $ 2*SIZE, CO1 -.endm - - -/******************************************************************************************/ - -.macro INIT1x12 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - vxorpd %xmm12, %xmm12, %xmm12 - vxorpd %xmm13, %xmm13, %xmm13 - vxorpd %xmm14, %xmm14, %xmm14 - vxorpd %xmm15, %xmm15, %xmm15 - -.endm - -.macro KERNEL1x12_SUB - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -11 * SIZE(BO), %xmm2 - vmovsd -10 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vmovsd -9 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - vmovsd -8 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm6 - vmovsd -7 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm7 - vmovsd -6 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm8 - vmovsd -5 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm9 - vmovsd -4 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm10 - vmovsd -3 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm11 - vmovsd -2 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm12 - vmovsd -1 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm13 - addq $ 12*SIZE, BO - vfmadd231sd %xmm0 ,%xmm2 , %xmm14 - addq $ 1*SIZE, AO - vfmadd231sd %xmm0 ,%xmm3 , %xmm15 - -.endm - -.macro SAVE1x12 - - vmovsd ALPHA, %xmm0 - 
- vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm7 , %xmm7 - - vmulsd %xmm0 , %xmm8 , %xmm8 - vmulsd %xmm0 , %xmm9 , %xmm9 - vmulsd %xmm0 , %xmm10, %xmm10 - vmulsd %xmm0 , %xmm11, %xmm11 - - vmulsd %xmm0 , %xmm12, %xmm12 - vmulsd %xmm0 , %xmm13, %xmm13 - vmulsd %xmm0 , %xmm14, %xmm14 - vmulsd %xmm0 , %xmm15, %xmm15 - - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - vaddsd (%rax), %xmm6, %xmm6 - vaddsd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (%rax) - vmovsd %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddsd (%rax), %xmm8 , %xmm4 - vaddsd (%rax, LDC), %xmm9 , %xmm5 - vaddsd (%rbp), %xmm10, %xmm6 - vaddsd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovsd %xmm4 , (%rax) - vmovsd %xmm5 , (%rax, LDC) - vmovsd %xmm6 , (%rbp) - vmovsd %xmm7 , (%rbp, LDC) - - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddsd (%rax), %xmm12, %xmm4 - vaddsd (%rax, LDC), %xmm13, %xmm5 - vaddsd (%rbp), %xmm14, %xmm6 - vaddsd (%rbp, LDC), %xmm15, %xmm7 - -#endif - - vmovsd %xmm4 , (%rax) - vmovsd %xmm5 , (%rax, LDC) - vmovsd %xmm6 , (%rbp) - vmovsd %xmm7 , (%rbp, LDC) - - addq $ 1*SIZE, CO1 -.endm - - - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT4x4 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - -.endm - -.macro KERNEL4x4_I - prefetcht0 A_PR1(AO) - vmovups -12 * SIZE(BO), %ymm1 - vmovups -16 * SIZE(AO), %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm5 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm6 - - addq $ 4*SIZE, BO - vpermpd $ 0xb1, %ymm0 , %ymm0 - vmulpd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - -.endm - -.macro KERNEL4x4_M1 - prefetcht0 A_PR1(AO) - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - -.endm - -.macro KERNEL4x4_M2 - vmovups -12 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - - addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -8 * SIZE(BO), %ymm1 - addq $ 8*SIZE, BO -.endm - - -.macro KERNEL4x4_E - vmovups -12 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - - addq $ 8*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - addq $ 4*SIZE, BO -.endm - -.macro KERNEL4x4_SUB - vmovups -12 * SIZE(BO), %ymm1 - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - addq $ 4*SIZE, BO - vpermpd $ 0x1b, %ymm0 , %ymm0 - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - addq $ 4*SIZE, AO - vpermpd $ 0xb1, %ymm0 , %ymm0 - vfmadd231pd 
%ymm0 ,%ymm1 , %ymm7 - -.endm - -.macro SAVE4x4 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - - vpermpd $ 0xb1 , %ymm5, %ymm5 - vpermpd $ 0xb1 , %ymm7, %ymm7 - - vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 - vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 - vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - - vpermpd $ 0x1b , %ymm2, %ymm2 - vpermpd $ 0x1b , %ymm3, %ymm3 - vpermpd $ 0xb1 , %ymm2, %ymm2 - vpermpd $ 0xb1 , %ymm3, %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4, %ymm4 - vaddpd (CO1, LDC), %ymm5, %ymm5 - vaddpd (%rax), %ymm6, %ymm6 - vaddpd (%rax, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (%rax) - vmovups %ymm7 , (%rax, LDC) - - addq $ 4*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x4 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL2x4_SUB - vmovddup -12 * SIZE(BO), %xmm1 - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -11 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm1 , %xmm4 - vmovddup -10 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , %xmm5 - vmovddup -9 * SIZE(BO), %xmm8 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - addq $ 4*SIZE, BO - vfmadd231pd %xmm0 ,%xmm8 , %xmm7 - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x4 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - leaq (CO1, LDC, 2), %rax - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm5, %xmm5 - vaddpd (%rax), %xmm6, %xmm6 - vaddpd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (%rax) - vmovups %xmm7 , (%rax, LDC) - - addq $ 2*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x4 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL1x4_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vmovsd -10 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - vmovsd -9 * SIZE(BO), %xmm8 - vfmadd231sd %xmm0 ,%xmm3 , %xmm6 - addq $ 4*SIZE, BO - vfmadd231sd %xmm0 ,%xmm8 , %xmm7 - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x4 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm7 , %xmm7 - - leaq (CO1, LDC, 2), %rax - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - vaddsd (%rax), %xmm6, %xmm6 - vaddsd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (%rax) - vmovsd %xmm7 , (%rax, LDC) - - addq $ 1*SIZE, CO1 -.endm - - 
-/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT4x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL4x2_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vmovups -14 * SIZE(AO), %xmm1 - vmovddup -11 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - vfmadd231pd %xmm1 ,%xmm2 , %xmm5 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - vfmadd231pd %xmm1 ,%xmm3 , %xmm7 - addq $ 2*SIZE, BO - addq $ 4*SIZE, AO - -.endm - - -.macro SAVE4x2 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 - vaddpd (CO1, LDC), %xmm6, %xmm6 - vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , 2 * SIZE(CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm7 , 2 * SIZE(CO1, LDC) - - addq $ 4*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm6 , %xmm6 , %xmm6 - -.endm - - -.macro KERNEL2x2_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -11 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - addq $ 2*SIZE, BO - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x2 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm6 , %xmm6 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm6, %xmm6 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - - addq $ 2*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - -.endm - - -.macro KERNEL1x2_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - addq $ 2*SIZE, BO - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x2 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - - addq $ 1*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT4x1 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - -.endm - - -.macro KERNEL4x1 - - vbroadcastsd -12 * SIZE(BO), %ymm0 - vbroadcastsd -11 * SIZE(BO), %ymm1 - vbroadcastsd -10 * SIZE(BO), %ymm2 - vbroadcastsd -9 * SIZE(BO), %ymm3 - - vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4 - vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5 - - vbroadcastsd -8 * SIZE(BO), %ymm0 - vbroadcastsd -7 * SIZE(BO), %ymm1 - - vfmadd231pd -8 * SIZE(AO) 
,%ymm2 , %ymm6 - vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7 - - vbroadcastsd -6 * SIZE(BO), %ymm2 - vbroadcastsd -5 * SIZE(BO), %ymm3 - - vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4 - vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5 - vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6 - vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7 - - addq $ 8 *SIZE, BO - addq $ 32*SIZE, AO - -.endm - - -.macro KERNEL4x1_SUB - vbroadcastsd -12 * SIZE(BO), %ymm2 - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm2 , %ymm4 - addq $ 1*SIZE, BO - addq $ 4*SIZE, AO - -.endm - - -.macro SAVE4x1 - - vbroadcastsd ALPHA, %ymm0 - - vaddpd %ymm4,%ymm5, %ymm4 - vaddpd %ymm6,%ymm7, %ymm6 - vaddpd %ymm4,%ymm6, %ymm4 - - vmulpd %ymm0 , %ymm4 , %ymm4 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %ymm4, %ymm4 - -#endif - - vmovups %ymm4 , (CO1) - - addq $ 4*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x1 - - vxorpd %xmm4 , %xmm4 , %xmm4 - -.endm - - -.macro KERNEL2x1_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - addq $ 1*SIZE, BO - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x1 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - -#endif - - vmovups %xmm4 , (CO1) - - addq $ 2*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x1 - - vxorpd %xmm4 , %xmm4 , %xmm4 - -.endm - - -.macro KERNEL1x1_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - addq $ 1*SIZE, BO - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x1 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - -#endif - - vmovsd %xmm4 , (CO1) - - addq $ 1*SIZE, CO1 -.endm - - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovups %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $12, %rdi - divq %rdi // N / 12 - movq %rax, Ndiv12 // N / 12 - movq %rdx, Nmod12 // N % 12 - - - movq Ndiv12, J - cmpq $ 0, J - je .L4_0 - ALIGN_4 - -.L12_01: - // copy to sub buffer - movq K, %rax - salq $2,%rax // K * 4 ; read 2 values - 
movq B, BO1 - leaq (B,%rax, SIZE), BO2 // next offset to BO2 - leaq (BO2,%rax, SIZE), BO3 // next offset to BO2 - - - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $1 , %rax // K / 2 - jz .L12_01a_2 - ALIGN_4 - -.L12_01a_1: - - prefetcht0 512(BO1) - prefetcht0 512(BO2) - prefetcht0 512(BO3) - prefetchw 512(BO) - - - vmovups 0 * SIZE(BO1), %ymm1 - vmovups 4 * SIZE(BO1), %ymm5 - vmovups 0 * SIZE(BO2), %ymm2 - vmovups 4 * SIZE(BO2), %ymm6 - vmovups 0 * SIZE(BO3), %ymm3 - vmovups 4 * SIZE(BO3), %ymm7 - - vmovups %ymm1, 0 * SIZE(BO) - vmovups %ymm2, 4 * SIZE(BO) - vmovups %ymm3, 8 * SIZE(BO) - - vmovups %ymm5, 12 * SIZE(BO) - vmovups %ymm6, 16 * SIZE(BO) - vmovups %ymm7, 20 * SIZE(BO) - - addq $ 8 * SIZE ,BO1 - addq $ 8 * SIZE ,BO2 - addq $ 8 * SIZE ,BO3 - addq $ 24 *SIZE ,BO - - decq %rax - jnz .L12_01a_1 - - - -.L12_01a_2: - - movq K, %rax - andq $1, %rax // K % 2 - jz .L12_03c - ALIGN_4 - - -.L12_02b: - - vmovups 0 * SIZE(BO1), %ymm1 - vmovups 0 * SIZE(BO2), %ymm2 - vmovups 0 * SIZE(BO3), %ymm3 - vmovups %ymm1, 0 * SIZE(BO) - vmovups %ymm2, 4 * SIZE(BO) - vmovups %ymm3, 8 * SIZE(BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO2 - addq $ 4*SIZE,BO3 - addq $ 12*SIZE,BO - decq %rax - jnz .L12_02b - -.L12_03c: - - movq BO3, B // next offset of B - -.L12_10: - movq C, CO1 - leaq (C, LDC, 8), C - leaq (C, LDC, 4), C // c += 12 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L12_20 - - ALIGN_4 - -.L12_11: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - - jl .L12_13 - - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - subq $2, %rax - je .L12_12a - - ALIGN_5 -.L12_12: - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - dec %rax - jne .L12_12 - -.L12_12a: - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L12_16 - - -.L12_13: - - test $1, %rax - jz .L12_14 - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L12_16 - - -.L12_14: - - INIT4x12 - - -.L12_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L12_19 - - ALIGN_4 - -.L12_17: - - KERNEL4x12_SUB - - dec %rax - jne .L12_17 - ALIGN_4 - - -.L12_19: - - SAVE4x12 - - decq I # i -- - jne .L12_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L12_20: - // Test rest of M - - testq $3, M - jz .L12_100 // to next 16 lines of N - - -.L12_30: - testq $2, M - jz .L12_40 - - ALIGN_4 - -.L12_31: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x12 - - movq K, %rax - - sarq $3, %rax - je .L12_36 - ALIGN_4 - -.L12_32: - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - dec %rax - jne .L12_32 - ALIGN_4 - -.L12_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L12_39 - - ALIGN_4 - -.L12_37: - - KERNEL2x12_SUB - - dec %rax - jne .L12_37 - ALIGN_4 - - -.L12_39: - - SAVE2x12 - - ALIGN_4 - -.L12_40: - testq $1, M - jz .L12_100 // to next 3 lines of N - - ALIGN_4 - -.L12_41: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO 
- - INIT1x12 - - movq K, %rax - - sarq $3,%rax - je .L12_46 - - ALIGN_4 - -.L12_42: - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - - dec %rax - jne .L12_42 - ALIGN_4 - -.L12_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L12_49 - - ALIGN_4 - -.L12_47: - - KERNEL1x12_SUB - - dec %rax - jne .L12_47 - ALIGN_4 - - -.L12_49: - - SAVE1x12 - - ALIGN_4 - -.L12_100: - - decq J // j -- - jg .L12_01 - - -.L4_0: - - cmpq $ 0, Nmod12 // N % 12 == 0 - je .L999 - - movq Nmod12, J - sarq $2, J // j = j / 4 - je .L2_0 - -.L4_10: - movq C, CO1 - leaq (C, LDC, 4), C // c += 4 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L4_20 - - ALIGN_4 - -.L4_11: - movq B, BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L4_13 - - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - subq $2, %rax - je .L4_12a - - ALIGN_5 - -.L4_12: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - dec %rax - jne .L4_12 - -.L4_12a: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_13: - - test $1, %rax - jz .L4_14 - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_14: - - INIT4x4 - - -.L4_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_19 - - ALIGN_4 - -.L4_17: - - KERNEL4x4_SUB - - dec %rax - jne .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE4x4 - - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $3, M - jz .L4_100 // to next 16 lines of N - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x4 - - movq K, %rax - - sarq $3, %rax - je .L4_36 - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - dec %rax - jne .L4_32 - ALIGN_4 - -.L4_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_39 - - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - dec %rax - jne .L4_37 - - -.L4_39: - - SAVE2x4 - -.L4_40: - testq $1, M - jz .L4_100 // to next 3 lines of N - - ALIGN_4 - -.L4_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x4 - - movq K, %rax - - sarq $3,%rax - je .L4_46 - - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - dec %rax - jne .L4_42 - ALIGN_4 - -.L4_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_49 - - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - dec %rax - jne .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - - ALIGN_4 - -.L4_100: - - movq K, %rax - salq $2, %rax // * 4 - leaq (B , %rax, SIZE), B - decq J // j -- - jg .L4_10 - - - - -/***************************************************************************************************************/ - -.L2_0: - - movq Nmod12, J - testq $2, J - je .L1_0 - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $16 
* SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L2_20 - - ALIGN_4 - -.L2_11: - movq B, BO - addq $12 * SIZE, BO - - INIT4x2 - - movq K, %rax - sarq $3, %rax // K / 8 - - je .L2_16 - - ALIGN_5 - -.L2_12: - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - dec %rax - jne .L2_12 - - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB - - dec %rax - jne .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE4x2 - - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $3, M - jz .L2_100 // to next 16 lines of N - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x2 - - movq K, %rax - - sarq $3, %rax - je .L2_36 - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - dec %rax - jne .L2_32 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - dec %rax - jne .L2_37 - - -.L2_39: - - SAVE2x2 - -.L2_40: - testq $1, M - jz .L2_100 // to next 3 lines of N - -.L2_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x2 - - movq K, %rax - - sarq $3,%rax - je .L2_46 - - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - dec %rax - jne .L2_42 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - dec %rax - jne .L2_47 - -.L2_49: - - SAVE1x2 - -.L2_100: - - movq K, %rax - salq $1, %rax // * 2 - leaq (B , %rax, SIZE), B - -/***************************************************************************************************************/ - -.L1_0: - - movq Nmod12, J - testq $1, J - je .L999 - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L1_20 - - ALIGN_4 - -.L1_11: - movq B, BO - addq $12 * SIZE, BO - - INIT4x1 - - movq K, %rax - - sarq $3, %rax // K / 8 - je .L1_16 - - ALIGN_5 - -.L1_12: - - KERNEL4x1 - - dec %rax - jne .L1_12 - - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB - - dec %rax - jne .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE4x1 - - decq I # i -- - jg .L1_11 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $3, M - jz .L1_100 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x1 - - movq K, %rax - - sarq $3, %rax - je .L1_36 - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - - dec %rax - jne .L1_32 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - dec %rax - jne .L1_37 - -.L1_39: - - SAVE2x1 - -.L1_40: - testq $1, M - jz .L1_100 // to next 3 lines of N - - -.L1_41: - movq B, BO // first buffer to BO - 
addq $12 * SIZE, BO - - INIT1x1 - - movq K, %rax - - sarq $3,%rax - je .L1_46 - - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - dec %rax - jne .L1_42 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - dec %rax - jne .L1_47 - - -.L1_49: - - SAVE1x1 - -.L1_100: - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovups %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $4, %rdi - divq %rdi // N / 4 - movq %rax, Ndiv12 // N / 4 - movq %rdx, Nmod12 // N % 4 - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - - - movq Ndiv12, J - cmpq $ 0, J - je .L2_0 - ALIGN_4 - -.L4_10: - movq C, CO1 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L4_20 - - ALIGN_4 - -.L4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,4), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, 
%rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L4_13 - - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - subq $2, %rax - je .L4_12a - - ALIGN_5 - -.L4_12: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - dec %rax - jne .L4_12 - -.L4_12a: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_13: - - test $1, %rax - jz .L4_14 - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_14: - - INIT4x4 - - -.L4_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L4_19 - - ALIGN_4 - -.L4_17: - - KERNEL4x4_SUB - - dec %rax - jne .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 4), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $3, M - jz .L4_100 // to next 16 lines of N - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,4), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x4 - - sarq $3, %rax - je .L4_36 - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - dec %rax - jne .L4_32 - ALIGN_4 - -.L4_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L4_39 - - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - dec %rax - jne .L4_37 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 4), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L4_40: - testq $1, M - jz .L4_100 // to next 3 lines of N - - ALIGN_4 - -.L4_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq 
$12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,4), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x4 - - sarq $3,%rax - je .L4_46 - - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - dec %rax - jne .L4_42 - ALIGN_4 - -.L4_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L4_49 - - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - dec %rax - jne .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 4), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - -.L4_100: - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK // number of values in B -#endif - - - movq K, %rax - salq $2, %rax // * 4 - leaq (B , %rax, SIZE), B - decq J // j -- - jg .L4_10 - - - - -/***************************************************************************************************************/ - -.L2_0: - - movq Nmod12, J - testq $2, J - je .L1_0 - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L2_20 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,2), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT4x2 - - sarq $3, %rax // K / 8 - - je .L2_16 - - ALIGN_5 - -.L2_12: - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - dec %rax - jne .L2_12 - - -.L2_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB - - dec %rax - jne .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 2), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - - decq I # i -- - jg 
.L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $3, M - jz .L2_100 // to next 16 lines of N - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,2), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x2 - - sarq $3, %rax - je .L2_36 - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - dec %rax - jne .L2_32 - -.L2_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - dec %rax - jne .L2_37 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 2), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L2_40: - testq $1, M - jz .L2_100 // to next 3 lines of N - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,2), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x2 - - sarq $3,%rax - je .L2_46 - - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - dec %rax - jne .L2_42 - -.L2_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - dec %rax - jne .L2_47 - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 2), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - - -.L2_100: - - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK // number of values in B -#endif - - movq K, %rax - salq $1, %rax // * 
2 - leaq (B , %rax, SIZE), B - -/***************************************************************************************************************/ - -.L1_0: - - movq Nmod12, J - testq $1, J - je .L999 - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L1_20 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,1), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT4x1 - - sarq $3, %rax // K / 8 - je .L1_16 - - ALIGN_5 - -.L1_12: - - KERNEL4x1 - - dec %rax - jne .L1_12 - - -.L1_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB - - dec %rax - jne .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 1), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - - decq I # i -- - jg .L1_11 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $3, M - jz .L1_100 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,1), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x1 - - sarq $3, %rax - je .L1_36 - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - - dec %rax - jne .L1_32 - -.L1_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - dec %rax - jne .L1_37 - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 1), BO // number of values 
in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L1_40: - testq $1, M - jz .L1_100 // to next 3 lines of N - - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,1), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x1 - - sarq $3,%rax - je .L1_46 - - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - dec %rax - jne .L1_42 - -.L1_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - dec %rax - jne .L1_47 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 1), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - - - -.L1_100: - - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $1, KK // number of values in B -#endif - - - -.L999: - - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - - - -#endif +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + + +/********************************************************************* +* 2013/10/28 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK + +* +* +* 2013/10/27 Saar +* Parameter: +* DGEMM_DEFAULT_UNROLL_N 4 +* DGEMM_DEFAULT_UNROLL_M 4 +* DGEMM_DEFAULT_P 512 +* DGEMM_DEFAULT_Q 256 +* A_PR1 512 +* B_PR1 512 +* +* +* Performance at 9216x9216x9216: +* 1 thread: 53.3 GFLOPS (MKL: 54) +* 2 threads: 100.0 GFLOPS (MKL: 97) +* 3 threads: 147.0 GFLOPS (MKL: 133) +* 4 threads: 184.0 GFLOPS (MKL: 170) +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 +#define BO3 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 +#define L_BUFFER_SIZE 256*8*12+4096 + +#else + +#define STACKSIZE 256 +#define L_BUFFER_SIZE 128*8*12+512 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + + +#define Ndiv12 24(%rsp) +#define Nmod12 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $ 0, 4096 * 4(%rsp);\ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $ 0, 4096 * 3(%rsp);\ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $ 0, 4096 * 2(%rsp);\ + movl $ 0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $ 0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#define A_PR1 512 +#define B_PR1 512 + +/******************************************************************************************* +* Macro definitions +*******************************************************************************************/ + +.macro INIT4x12 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + vxorpd %ymm8 , %ymm8 , %ymm8 + vxorpd %ymm9 , %ymm9 , %ymm9 + vxorpd %ymm10, %ymm10, %ymm10 + vxorpd %ymm11, %ymm11, %ymm11 + vxorpd %ymm12, %ymm12, %ymm12 + vxorpd %ymm13, %ymm13, %ymm13 + vxorpd %ymm14, %ymm14, %ymm14 + vxorpd %ymm15, %ymm15, %ymm15 + +.endm + +.macro KERNEL4x12_I + prefetcht0 A_PR1(AO) + vmovups -12 * SIZE(BO), %ymm1 + prefetcht0 B_PR1(BO) + vmovups -16 * SIZE(AO), %ymm0 + prefetcht0 B_PR1+64(BO) + vmovups -8 * SIZE(BO), %ymm2 + prefetcht0 B_PR1+128(BO) + 
vmovups -4 * SIZE(BO), %ymm3 + vmulpd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+192(BO) + vmulpd %ymm0 ,%ymm2 , %ymm8 + vmulpd %ymm0 ,%ymm3 , %ymm12 + prefetcht0 B_PR1+256(BO) + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm5 + vmulpd %ymm0 ,%ymm2 , %ymm9 + vmulpd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm6 + vmulpd %ymm0 ,%ymm2 , %ymm10 + + addq $ 12*SIZE, BO + vmulpd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vmulpd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + vmulpd %ymm0 ,%ymm3 , %ymm15 + vmovups -4 * SIZE(BO), %ymm3 + +.endm + +.macro KERNEL4x12_M1 + prefetcht0 A_PR1(AO) + vmovups -16 * SIZE(AO), %ymm0 + prefetcht0 B_PR1(BO) + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+64(BO) + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + prefetcht0 B_PR1+128(BO) + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups -4 * SIZE(BO), %ymm3 + +.endm + +.macro KERNEL4x12_M2 + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups 0 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups 4 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups 8 * SIZE(BO), %ymm3 + addq $ 24*SIZE, BO +.endm + + +.macro KERNEL4x12_E + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + addq $ 12*SIZE, BO +.endm + +.macro KERNEL4x12_SUB + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vmovups -4 * SIZE(BO), %ymm3 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + addq $ 12*SIZE, BO + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + addq $ 4*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + +.endm + + +.macro SAVE4x12 + + 
vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm11, %ymm11 + + vmulpd %ymm0 , %ymm12, %ymm12 + vmulpd %ymm0 , %ymm13, %ymm13 + vmulpd %ymm0 , %ymm14, %ymm14 + vmulpd %ymm0 , %ymm15, %ymm15 + + vpermpd $ 0xb1 , %ymm5, %ymm5 + vpermpd $ 0xb1 , %ymm7, %ymm7 + + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + prefetcht0 32(CO1) + prefetcht0 32(CO1,LDC) + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + + vpermpd $ 0xb1 , %ymm9 , %ymm9 + vpermpd $ 0xb1 , %ymm11, %ymm11 + + vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 + vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 + vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 + vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + prefetcht0 32(%rbp) + prefetcht0 32(%rbp,LDC) + + vpermpd $ 0xb1 , %ymm13, %ymm13 + vpermpd $ 0xb1 , %ymm15, %ymm15 + + vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 + vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 + vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 + vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 32(%rax) + prefetcht0 32(%rax,LDC) + prefetcht0 32(%rbp) + prefetcht0 32(%rbp,LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ + +.macro INIT2x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , 
%xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL2x12_SUB + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -12 * SIZE(BO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm2 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -9 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -8 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vmovddup -7 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm7 + vmovddup -6 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm8 + vmovddup -5 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm9 + vmovddup -4 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm10 + vmovddup -3 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm11 + vmovddup -2 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm12 + vmovddup -1 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm13 + addq $ 12*SIZE, BO + vfmadd231pd %xmm0 ,%xmm2 , %xmm14 + addq $ 2*SIZE, AO + vfmadd231pd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE2x12 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + vmulpd %xmm0 , %xmm8 , %xmm8 + vmulpd %xmm0 , %xmm9 , %xmm9 + vmulpd %xmm0 , %xmm10, %xmm10 + vmulpd %xmm0 , %xmm11, %xmm11 + + vmulpd %xmm0 , %xmm12, %xmm12 + vmulpd %xmm0 , %xmm13, %xmm13 + vmulpd %xmm0 , %xmm14, %xmm14 + vmulpd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm8 , %xmm4 + vaddpd (%rax, LDC), %xmm9 , %xmm5 + vaddpd (%rbp), %xmm10, %xmm6 + vaddpd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm12, %xmm4 + vaddpd (%rax, LDC), %xmm13, %xmm5 + vaddpd (%rbp), %xmm14, %xmm6 + vaddpd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ + +.macro INIT1x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL1x12_SUB + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -11 * SIZE(BO), %xmm2 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -9 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -8 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + vmovsd -7 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm7 + vmovsd -6 * SIZE(BO), %xmm1 + 
vfmadd231sd %xmm0 ,%xmm2 , %xmm8 + vmovsd -5 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm9 + vmovsd -4 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm10 + vmovsd -3 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm11 + vmovsd -2 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm12 + vmovsd -1 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm13 + addq $ 12*SIZE, BO + vfmadd231sd %xmm0 ,%xmm2 , %xmm14 + addq $ 1*SIZE, AO + vfmadd231sd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE1x12 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm9 , %xmm9 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm11, %xmm11 + + vmulsd %xmm0 , %xmm12, %xmm12 + vmulsd %xmm0 , %xmm13, %xmm13 + vmulsd %xmm0 , %xmm14, %xmm14 + vmulsd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm8 , %xmm4 + vaddsd (%rax, LDC), %xmm9 , %xmm5 + vaddsd (%rbp), %xmm10, %xmm6 + vaddsd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm12, %xmm4 + vaddsd (%rax, LDC), %xmm13, %xmm5 + vaddsd (%rbp), %xmm14, %xmm6 + vaddsd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + addq $ 1*SIZE, CO1 +.endm + + + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x4 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + +.endm + +.macro KERNEL4x4_I + prefetcht0 A_PR1(AO) + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm5 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm6 + + addq $ 4*SIZE, BO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + +.endm + +.macro KERNEL4x4_M1 + prefetcht0 A_PR1(AO) + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + +.endm + +.macro KERNEL4x4_M2 + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + addq $ 8*SIZE, AO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -8 * SIZE(BO), %ymm1 + addq $ 8*SIZE, BO +.endm + + +.macro KERNEL4x4_E + vmovups -12 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 
+ vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + addq $ 8*SIZE, AO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + addq $ 4*SIZE, BO +.endm + +.macro KERNEL4x4_SUB + vmovups -12 * SIZE(BO), %ymm1 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + addq $ 4*SIZE, BO + vpermpd $ 0x1b, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + addq $ 4*SIZE, AO + vpermpd $ 0xb1, %ymm0 , %ymm0 + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + +.endm + +.macro SAVE4x4 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + + vpermpd $ 0xb1 , %ymm5, %ymm5 + vpermpd $ 0xb1 , %ymm7, %ymm7 + + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 + + vpermpd $ 0x1b , %ymm2, %ymm2 + vpermpd $ 0x1b , %ymm3, %ymm3 + vpermpd $ 0xb1 , %ymm2, %ymm2 + vpermpd $ 0xb1 , %ymm3, %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x4 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL2x4_SUB + vmovddup -12 * SIZE(BO), %xmm1 + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -11 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -9 * SIZE(BO), %xmm8 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + addq $ 4*SIZE, BO + vfmadd231pd %xmm0 ,%xmm8 , %xmm7 + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x4 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + leaq (CO1, LDC, 2), %rax + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + addq $ 2*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x4 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL1x4_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -9 * SIZE(BO), %xmm8 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + addq $ 4*SIZE, BO + vfmadd231sd %xmm0 ,%xmm8 , %xmm7 + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x4 + + 
vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + leaq (CO1, LDC, 2), %rax + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + addq $ 1*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL4x2_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovups -14 * SIZE(AO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm1 ,%xmm2 , %xmm5 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vfmadd231pd %xmm1 ,%xmm3 , %xmm7 + addq $ 2*SIZE, BO + addq $ 4*SIZE, AO + +.endm + + +.macro SAVE4x2 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 + vaddpd (CO1, LDC), %xmm6, %xmm6 + vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , 2 * SIZE(CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm7 , 2 * SIZE(CO1, LDC) + + addq $ 4*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm6 , %xmm6 , %xmm6 + +.endm + + +.macro KERNEL2x2_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -11 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + addq $ 2*SIZE, BO + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x2 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm6 , %xmm6 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm6, %xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + + addq $ 2*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + +.endm + + +.macro KERNEL1x2_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + addq $ 2*SIZE, BO + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + + addq $ 1*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x1 + + vxorpd %ymm4 , 
%ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + +.endm + + +.macro KERNEL4x1 + + vbroadcastsd -12 * SIZE(BO), %ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm1 + vbroadcastsd -10 * SIZE(BO), %ymm2 + vbroadcastsd -9 * SIZE(BO), %ymm3 + + vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4 + vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5 + + vbroadcastsd -8 * SIZE(BO), %ymm0 + vbroadcastsd -7 * SIZE(BO), %ymm1 + + vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6 + vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7 + + vbroadcastsd -6 * SIZE(BO), %ymm2 + vbroadcastsd -5 * SIZE(BO), %ymm3 + + vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4 + vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5 + vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6 + vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7 + + addq $ 8 *SIZE, BO + addq $ 32*SIZE, AO + +.endm + + +.macro KERNEL4x1_SUB + vbroadcastsd -12 * SIZE(BO), %ymm2 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm2 , %ymm4 + addq $ 1*SIZE, BO + addq $ 4*SIZE, AO + +.endm + + +.macro SAVE4x1 + + vbroadcastsd ALPHA, %ymm0 + + vaddpd %ymm4,%ymm5, %ymm4 + vaddpd %ymm6,%ymm7, %ymm6 + vaddpd %ymm4,%ymm6, %ymm4 + + vmulpd %ymm0 , %ymm4 , %ymm4 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %ymm4, %ymm4 + +#endif + + vmovups %ymm4 , (CO1) + + addq $ 4*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + +.endm + + +.macro KERNEL2x1_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + addq $ 1*SIZE, BO + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x1 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + +#endif + + vmovups %xmm4 , (CO1) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + +.endm + + +.macro KERNEL1x1_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + addq $ 1*SIZE, BO + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + + addq $ 1*SIZE, CO1 +.endm + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + 
STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $12, %rdi + divq %rdi // N / 12 + movq %rax, Ndiv12 // N / 12 + movq %rdx, Nmod12 // N % 12 + + + movq Ndiv12, J + cmpq $ 0, J + je .L4_0 + ALIGN_4 + +.L12_01: + // copy to sub buffer + movq K, %rax + salq $2,%rax // K * 4 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq (BO2,%rax, SIZE), BO3 // next offset to BO2 + + + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $1 , %rax // K / 2 + jz .L12_01a_2 + ALIGN_4 + +.L12_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetcht0 512(BO3) + prefetchw 512(BO) + + + vmovups 0 * SIZE(BO1), %ymm1 + vmovups 4 * SIZE(BO1), %ymm5 + vmovups 0 * SIZE(BO2), %ymm2 + vmovups 4 * SIZE(BO2), %ymm6 + vmovups 0 * SIZE(BO3), %ymm3 + vmovups 4 * SIZE(BO3), %ymm7 + + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + + vmovups %ymm5, 12 * SIZE(BO) + vmovups %ymm6, 16 * SIZE(BO) + vmovups %ymm7, 20 * SIZE(BO) + + addq $ 8 * SIZE ,BO1 + addq $ 8 * SIZE ,BO2 + addq $ 8 * SIZE ,BO3 + addq $ 24 *SIZE ,BO + + decq %rax + jnz .L12_01a_1 + + + +.L12_01a_2: + + movq K, %rax + andq $1, %rax // K % 2 + jz .L12_03c + ALIGN_4 + + +.L12_02b: + + vmovups 0 * SIZE(BO1), %ymm1 + vmovups 0 * SIZE(BO2), %ymm2 + vmovups 0 * SIZE(BO3), %ymm3 + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO2 + addq $ 4*SIZE,BO3 + addq $ 12*SIZE,BO + decq %rax + jnz .L12_02b + +.L12_03c: + + movq BO3, B // next offset of B + +.L12_10: + movq C, CO1 + leaq (C, LDC, 8), C + leaq (C, LDC, 4), C // c += 12 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L12_20 + + ALIGN_4 + +.L12_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + + jl .L12_13 + + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + subq $2, %rax + je .L12_12a + + ALIGN_5 +.L12_12: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + dec %rax + jne .L12_12 + +.L12_12a: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L12_16 + + +.L12_13: + + test $1, %rax + jz .L12_14 + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L12_16 + + +.L12_14: + + INIT4x12 + + +.L12_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_19 + + ALIGN_4 + +.L12_17: + + KERNEL4x12_SUB + + dec %rax + jne .L12_17 + ALIGN_4 + + +.L12_19: + + SAVE4x12 + + decq I # i -- + jne .L12_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L12_20: + // Test rest of M + + testq $3, M + jz .L12_100 // to next 16 lines of N + + +.L12_30: + testq $2, M + jz .L12_40 + + ALIGN_4 + +.L12_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x12 + + movq K, %rax + + sarq $3, %rax + je .L12_36 + ALIGN_4 + +.L12_32: + + KERNEL2x12_SUB + 
KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + dec %rax + jne .L12_32 + ALIGN_4 + +.L12_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_39 + + ALIGN_4 + +.L12_37: + + KERNEL2x12_SUB + + dec %rax + jne .L12_37 + ALIGN_4 + + +.L12_39: + + SAVE2x12 + + ALIGN_4 + +.L12_40: + testq $1, M + jz .L12_100 // to next 3 lines of N + + ALIGN_4 + +.L12_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x12 + + movq K, %rax + + sarq $3,%rax + je .L12_46 + + ALIGN_4 + +.L12_42: + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + + dec %rax + jne .L12_42 + ALIGN_4 + +.L12_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_49 + + ALIGN_4 + +.L12_47: + + KERNEL1x12_SUB + + dec %rax + jne .L12_47 + ALIGN_4 + + +.L12_49: + + SAVE1x12 + + ALIGN_4 + +.L12_100: + + decq J // j -- + jg .L12_01 + + +.L4_0: + + cmpq $ 0, Nmod12 // N % 12 == 0 + je .L999 + + movq Nmod12, J + sarq $2, J // j = j / 4 + je .L2_0 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + +.L4_11: + movq B, BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + ALIGN_5 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x4 + + movq K, %rax + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x4 + + movq K, %rax + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + 
ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + + ALIGN_4 + +.L4_100: + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + decq J // j -- + jg .L4_10 + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x2 + + movq K, %rax + sarq $3, %rax // K / 8 + + je .L2_16 + + ALIGN_5 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x2 + + movq K, %rax + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x2 + + movq K, %rax + + sarq $3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +.L2_100: + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x1 + + movq K, %rax + + sarq $3, %rax // K / 8 + je .L1_16 + + ALIGN_5 + +.L1_12: + + KERNEL4x1 + + dec %rax + jne .L1_12 + + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x1 + 
+ movq K, %rax + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x1 + + movq K, %rax + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +.L1_100: + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $4, %rdi + divq %rdi // N / 4 + movq %rax, Ndiv12 // N / 4 + movq %rdx, Nmod12 // N % 4 + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + + + movq Ndiv12, J + cmpq $ 0, J + je .L2_0 + ALIGN_4 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + +.L4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + ALIGN_5 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x4 + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + 
subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x4 + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + +.L4_100: + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK // number of values in B +#endif + + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + decq J // j -- + jg .L4_10 + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x2 + + sarq $3, %rax // K / 8 + + je .L2_16 + + ALIGN_5 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + 
ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x2 + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x2 + + sarq $3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +#if 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + +.L2_100: + + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK // number of values in B +#endif + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x1 + + sarq $3, %rax // K / 8 + je .L1_16 + + ALIGN_5 + +.L1_12: + + KERNEL4x1 + + dec %rax + jne .L1_12 + + +.L1_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x1 + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + 
KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x1 + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + + +.L1_100: + + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK // number of values in B +#endif + + + +.L999: + + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S index 19e32ef2c..adaa28bbc 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_haswell.S +++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S @@ -1,5153 +1,5153 @@ -/********************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. 
Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 -#define BO3 %rbp - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 -#define L_BUFFER_SIZE 256*8*12+4096 - -#else - -#define STACKSIZE 256 -#define L_BUFFER_SIZE 128*8*12+512 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - - -#define Ndiv12 24(%rsp) -#define Nmod12 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $ 0, 4096 * 4(%rsp);\ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $ 0, 4096 * 3(%rsp);\ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $ 0, 4096 * 2(%rsp);\ - movl $ 0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $ 0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#define A_PR1 512 -#define B_PR1 160 -#define BROADCASTKERNEL - -/******************************************************************************************* -* Macro definitions -*******************************************************************************************/ - -.macro INIT4x12 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - vxorpd %ymm8 , %ymm8 , %ymm8 - vxorpd %ymm9 , %ymm9 , %ymm9 - vxorpd %ymm10, %ymm10, %ymm10 - vxorpd %ymm11, %ymm11, %ymm11 - vxorpd %ymm12, %ymm12, %ymm12 - vxorpd %ymm13, %ymm13, %ymm13 - vxorpd %ymm14, %ymm14, %ymm14 - vxorpd %ymm15, %ymm15, %ymm15 
- -.endm - -.macro KERNEL4x12_I - prefetcht0 A_PR1(AO) - vmovups -12 * SIZE(BO), %ymm1 - prefetcht0 B_PR1(BO) -# if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -# else - vmovups -16 * SIZE(AO), %ymm0 -# endif - prefetcht0 B_PR1+64(BO) - vmovups -8 * SIZE(BO), %ymm2 - prefetcht0 B_PR1+128(BO) - vmovups -4 * SIZE(BO), %ymm3 - vmulpd %ymm0 ,%ymm1 , %ymm4 - prefetcht0 B_PR1+192(BO) - vmulpd %ymm0 ,%ymm2 , %ymm8 - vmulpd %ymm0 ,%ymm3 , %ymm12 - prefetcht0 B_PR1+256(BO) -# if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vmulpd %ymm0 ,%ymm1 , %ymm5 - vmulpd %ymm0 ,%ymm2 , %ymm9 - vmulpd %ymm0 ,%ymm3 , %ymm13 -# if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -# else - vpermpd $ 0x1b, %ymm0 , %ymm0 -# endif - vmulpd %ymm0 ,%ymm1 , %ymm6 - vmulpd %ymm0 ,%ymm2 , %ymm10 - - addq $ 12*SIZE, BO - vmulpd %ymm0 ,%ymm3 , %ymm14 -# if defined BROADCASTKERNEL - vbroadcastsd -13 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vmulpd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vmulpd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - vmulpd %ymm0 ,%ymm3 , %ymm15 - vmovups -4 * SIZE(BO), %ymm3 - -.endm - -.macro KERNEL4x12_M1 - prefetcht0 A_PR1(AO) -# if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -# else - vmovups -16 * SIZE(AO), %ymm0 -# endif - prefetcht0 B_PR1(BO) - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - prefetcht0 B_PR1+64(BO) - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - prefetcht0 B_PR1+128(BO) - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 -# if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 -# if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -# else - vpermpd $ 0x1b, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 -# if defined BROADCASTKERNEL - vbroadcastsd -13 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - vmovups -4 * SIZE(BO), %ymm3 - -.endm - -.macro KERNEL4x12_M2 -# if defined BROADCASTKERNEL - vbroadcastsd -12 * SIZE(AO), %ymm0 -# else - vmovups -12 * SIZE(AO), %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 -# if defined BROADCASTKERNEL - vbroadcastsd -11 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 -# if defined BROADCASTKERNEL - vbroadcastsd -10 * SIZE(AO), %ymm0 -# else - vpermpd $ 0x1b, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 -# if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups 0 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups 4 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - vmovups 8 * SIZE(BO), %ymm3 - addq $ 24*SIZE, BO -.endm - - -.macro KERNEL4x12_E -# if defined BROADCASTKERNEL - vbroadcastsd -12 * SIZE(AO), %ymm0 -# else - vmovups -12 * SIZE(AO), %ymm0 -# 
endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 -# if defined BROADCASTKERNEL - vbroadcastsd -11 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 -# if defined BROADCASTKERNEL - vbroadcastsd -10 * SIZE(AO), %ymm0 -# else - vpermpd $ 0x1b, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 -# if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - addq $ 12*SIZE, BO -.endm - -.macro KERNEL4x12_SUB - vmovups -12 * SIZE(BO), %ymm1 -# if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -# else - vmovups -16 * SIZE(AO), %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vmovups -8 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 - vmovups -4 * SIZE(BO), %ymm3 - vfmadd231pd %ymm0 ,%ymm3 , %ymm12 -# if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - addq $ 12*SIZE, BO - vfmadd231pd %ymm0 ,%ymm3 , %ymm13 -# if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -# else - vpermpd $ 0x1b, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - addq $ 4*SIZE, AO - vfmadd231pd %ymm0 ,%ymm3 , %ymm14 -# if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -# else - vpermilpd $ 0x05, %ymm0 , %ymm0 -# endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vfmadd231pd %ymm0 ,%ymm3 , %ymm15 - -.endm - - -.macro SAVE4x12 - - prefetcht0 BUFFER1 - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - vmulpd %ymm0 , %ymm7 , %ymm7 - prefetcht0 64 + BUFFER1 - vmulpd %ymm0 , %ymm8 , %ymm8 - vmulpd %ymm0 , %ymm9 , %ymm9 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm11, %ymm11 -#if B_PR1 > 32 - prefetcht0 128 + BUFFER1 -#endif - vmulpd %ymm0 , %ymm12, %ymm12 - vmulpd %ymm0 , %ymm13, %ymm13 - vmulpd %ymm0 , %ymm14, %ymm14 - vmulpd %ymm0 , %ymm15, %ymm15 -#if B_PR1 > 96 - prefetcht0 192 + BUFFER1 -#endif - -#if defined BROADCASTKERNEL - vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 - vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 - vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 - vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 -#else - vpermilpd $ 0x05 , %ymm5, %ymm5 - vpermilpd $ 0x05 , %ymm7, %ymm7 -#endif - -#if B_PR1 > 160 - prefetcht0 256 + BUFFER1 -#endif - -#if defined BROADCASTKERNEL - vunpcklpd %ymm1, %ymm0, %ymm4 - vunpckhpd %ymm1, %ymm0, %ymm5 - vunpcklpd %ymm3, %ymm2, %ymm6 - vunpckhpd %ymm3, %ymm2, %ymm7 -#else - vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 - vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 - vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 -#endif - -#if B_PR1 > 224 - prefetcht0 320 + BUFFER1 -#endif - -#ifndef BROADCASTKERNEL - vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 -#endif - -#if B_PR1 > 288 - prefetcht0 384 + BUFFER1 -#endif - -#ifndef BROADCASTKERNEL - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 
, %ymm7 -#endif - -#if B_PR1 > 352 - prefetcht0 448 + BUFFER1 -#endif - leaq (CO1, LDC, 2), %rax - -#if B_PR1 > 416 - prefetcht0 512 + BUFFER1 -#endif - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4, %ymm4 - vaddpd (CO1, LDC), %ymm5, %ymm5 - vaddpd (%rax), %ymm6, %ymm6 - vaddpd (%rax, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (%rax) - vmovups %ymm7 , (%rax, LDC) - - prefetcht1 56(CO1) - prefetcht1 56(CO1,LDC) - prefetcht1 56(%rax) - prefetcht1 56(%rax,LDC) - -#if defined BROADCASTKERNEL - vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 - vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 - vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 - vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 - vunpcklpd %ymm1, %ymm0, %ymm4 - vunpckhpd %ymm1, %ymm0, %ymm5 - vunpcklpd %ymm3, %ymm2, %ymm6 - vunpckhpd %ymm3, %ymm2, %ymm7 -#else - vpermilpd $ 0x05 , %ymm9, %ymm9 - vpermilpd $ 0x05 , %ymm11, %ymm11 - - vblendpd $ 0x0a, %ymm9, %ymm8, %ymm0 - vblendpd $ 0x05, %ymm9, %ymm8, %ymm1 - vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 - vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - - vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#endif - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %ymm4, %ymm4 - vaddpd (%rax, LDC), %ymm5, %ymm5 - vaddpd (%rbp), %ymm6, %ymm6 - vaddpd (%rbp, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (%rax) - vmovups %ymm5 , (%rax, LDC) - vmovups %ymm6 , (%rbp) - vmovups %ymm7 , (%rbp, LDC) - - prefetcht1 56(%rax) - prefetcht1 56(%rax,LDC) - prefetcht1 56(%rbp) - prefetcht1 56(%rbp,LDC) - -#if defined BROADCASTKERNEL - vperm2f128 $ 0x20 , %ymm14, %ymm12 , %ymm0 - vperm2f128 $ 0x20 , %ymm15, %ymm13 , %ymm1 - vperm2f128 $ 0x31 , %ymm14, %ymm12 , %ymm2 - vperm2f128 $ 0x31 , %ymm15, %ymm13 , %ymm3 - vunpcklpd %ymm1, %ymm0, %ymm4 - vunpckhpd %ymm1, %ymm0, %ymm5 - vunpcklpd %ymm3, %ymm2, %ymm6 - vunpckhpd %ymm3, %ymm2, %ymm7 -#else - vpermilpd $ 0x05 , %ymm13, %ymm13 - vpermilpd $ 0x05 , %ymm15, %ymm15 - - vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 - vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 - vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 - vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 - - vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#endif - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %ymm4, %ymm4 - vaddpd (%rax, LDC), %ymm5, %ymm5 - vaddpd (%rbp), %ymm6, %ymm6 - vaddpd (%rbp, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (%rax) - vmovups %ymm5 , (%rax, LDC) - vmovups %ymm6 , (%rbp) - vmovups %ymm7 , (%rbp, LDC) - - prefetcht1 56(%rax) - prefetcht1 56(%rax,LDC) - prefetcht1 56(%rbp) - prefetcht1 56(%rbp,LDC) - - addq $ 4*SIZE, CO1 -.endm - -/******************************************************************************************/ - -.macro INIT2x12 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - vxorpd %xmm12, %xmm12, %xmm12 - vxorpd %xmm13, %xmm13, %xmm13 - vxorpd %xmm14, 
%xmm14, %xmm14 - vxorpd %xmm15, %xmm15, %xmm15 - -.endm - -.macro KERNEL2x12_SUB - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -12 * SIZE(BO), %xmm1 - vmovddup -11 * SIZE(BO), %xmm2 - vmovddup -10 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm4 - vmovddup -9 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm5 - vmovddup -8 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - vmovddup -7 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm7 - vmovddup -6 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm8 - vmovddup -5 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm9 - vmovddup -4 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm10 - vmovddup -3 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm11 - vmovddup -2 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm12 - vmovddup -1 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm13 - addq $ 12*SIZE, BO - vfmadd231pd %xmm0 ,%xmm2 , %xmm14 - addq $ 2*SIZE, AO - vfmadd231pd %xmm0 ,%xmm3 , %xmm15 - -.endm - -.macro SAVE2x12 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - vmulpd %xmm0 , %xmm8 , %xmm8 - vmulpd %xmm0 , %xmm9 , %xmm9 - vmulpd %xmm0 , %xmm10, %xmm10 - vmulpd %xmm0 , %xmm11, %xmm11 - - vmulpd %xmm0 , %xmm12, %xmm12 - vmulpd %xmm0 , %xmm13, %xmm13 - vmulpd %xmm0 , %xmm14, %xmm14 - vmulpd %xmm0 , %xmm15, %xmm15 - - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm5, %xmm5 - vaddpd (%rax), %xmm6, %xmm6 - vaddpd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (%rax) - vmovups %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %xmm8 , %xmm4 - vaddpd (%rax, LDC), %xmm9 , %xmm5 - vaddpd (%rbp), %xmm10, %xmm6 - vaddpd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovups %xmm4 , (%rax) - vmovups %xmm5 , (%rax, LDC) - vmovups %xmm6 , (%rbp) - vmovups %xmm7 , (%rbp, LDC) - - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %xmm12, %xmm4 - vaddpd (%rax, LDC), %xmm13, %xmm5 - vaddpd (%rbp), %xmm14, %xmm6 - vaddpd (%rbp, LDC), %xmm15, %xmm7 - -#endif - - vmovups %xmm4 , (%rax) - vmovups %xmm5 , (%rax, LDC) - vmovups %xmm6 , (%rbp) - vmovups %xmm7 , (%rbp, LDC) - - addq $ 2*SIZE, CO1 -.endm - - -/******************************************************************************************/ - -.macro INIT1x12 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - vxorpd %xmm12, %xmm12, %xmm12 - vxorpd %xmm13, %xmm13, %xmm13 - vxorpd %xmm14, %xmm14, %xmm14 - vxorpd %xmm15, %xmm15, %xmm15 - -.endm - -.macro KERNEL1x12_SUB - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -11 * SIZE(BO), %xmm2 - vmovsd -10 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vmovsd -9 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - vmovsd -8 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm6 - vmovsd -7 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm7 - vmovsd -6 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm8 - vmovsd -5 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm9 - vmovsd -4 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm10 - vmovsd -3 * SIZE(BO), %xmm1 - 
vfmadd231sd %xmm0 ,%xmm2 , %xmm11 - vmovsd -2 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm12 - vmovsd -1 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm13 - addq $ 12*SIZE, BO - vfmadd231sd %xmm0 ,%xmm2 , %xmm14 - addq $ 1*SIZE, AO - vfmadd231sd %xmm0 ,%xmm3 , %xmm15 - -.endm - -.macro SAVE1x12 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm7 , %xmm7 - - vmulsd %xmm0 , %xmm8 , %xmm8 - vmulsd %xmm0 , %xmm9 , %xmm9 - vmulsd %xmm0 , %xmm10, %xmm10 - vmulsd %xmm0 , %xmm11, %xmm11 - - vmulsd %xmm0 , %xmm12, %xmm12 - vmulsd %xmm0 , %xmm13, %xmm13 - vmulsd %xmm0 , %xmm14, %xmm14 - vmulsd %xmm0 , %xmm15, %xmm15 - - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - vaddsd (%rax), %xmm6, %xmm6 - vaddsd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (%rax) - vmovsd %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddsd (%rax), %xmm8 , %xmm4 - vaddsd (%rax, LDC), %xmm9 , %xmm5 - vaddsd (%rbp), %xmm10, %xmm6 - vaddsd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovsd %xmm4 , (%rax) - vmovsd %xmm5 , (%rax, LDC) - vmovsd %xmm6 , (%rbp) - vmovsd %xmm7 , (%rbp, LDC) - - - leaq (%rax, LDC, 4), %rax - leaq (%rbp, LDC, 4), %rbp - -#if !defined(TRMMKERNEL) - - vaddsd (%rax), %xmm12, %xmm4 - vaddsd (%rax, LDC), %xmm13, %xmm5 - vaddsd (%rbp), %xmm14, %xmm6 - vaddsd (%rbp, LDC), %xmm15, %xmm7 - -#endif - - vmovsd %xmm4 , (%rax) - vmovsd %xmm5 , (%rax, LDC) - vmovsd %xmm6 , (%rbp) - vmovsd %xmm7 , (%rbp, LDC) - - addq $ 1*SIZE, CO1 -.endm - - - - -/******************************************************************************************/ - - -.macro INIT4x8 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - vxorpd %ymm8 , %ymm8 , %ymm8 - vxorpd %ymm9 , %ymm9 , %ymm9 - vxorpd %ymm10, %ymm10, %ymm10 - vxorpd %ymm11, %ymm11, %ymm11 - -.endm - -.macro KERNEL4x8_I - vmovups -12 * SIZE(BO), %ymm1 -#if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -#else - vmovups -16 * SIZE(AO), %ymm0 -#endif - vmovups -8 * SIZE(BO), %ymm2 - vmulpd %ymm0 ,%ymm1 , %ymm4 - vmulpd %ymm0 ,%ymm2 , %ymm8 -#if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm5 - vmulpd %ymm0 ,%ymm2 , %ymm9 -#if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm6 - vmulpd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, BO -#if defined BROADCASTKERNEL - vbroadcastsd -13 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vmulpd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - -.endm - -.macro KERNEL4x8_M1 - prefetcht0 A_PR1(AO) -#if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -#else - vmovups -16 * SIZE(AO), %ymm0 -#endif - prefetcht0 B_PR1(BO) - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - prefetcht0 B_PR1+64(BO) - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 -#if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 -#if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, 
%ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 -#if defined BROADCASTKERNEL - vbroadcastsd -13 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups -8 * SIZE(BO), %ymm2 - -.endm - -.macro KERNEL4x8_M2 -#if defined BROADCASTKERNEL - vbroadcastsd -12 * SIZE(AO), %ymm0 -#else - vmovups -12 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 -#if defined BROADCASTKERNEL - vbroadcastsd -11 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 -#if defined BROADCASTKERNEL - vbroadcastsd -10 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO -#if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -4 * SIZE(BO), %ymm1 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - vmovups 0 * SIZE(BO), %ymm2 - addq $ 16*SIZE, BO -.endm - - -.macro KERNEL4x8_E -#if defined BROADCASTKERNEL - vbroadcastsd -12 * SIZE(AO), %ymm0 -#else - vmovups -12 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 -#if defined BROADCASTKERNEL - vbroadcastsd -11 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 -#if defined BROADCASTKERNEL - vbroadcastsd -10 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - - addq $ 8*SIZE, AO -#if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - addq $ 8*SIZE, BO -.endm - -.macro KERNEL4x8_SUB - vmovups -12 * SIZE(BO), %ymm1 -#if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -#else - vmovups -16 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 - vmovups -8 * SIZE(BO), %ymm2 - vfmadd231pd %ymm0 ,%ymm2 , %ymm8 -#if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - vfmadd231pd %ymm0 ,%ymm2 , %ymm9 - addq $ 8*SIZE, BO -#if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - vfmadd231pd %ymm0 ,%ymm2 , %ymm10 - addq $ 4*SIZE, AO -#if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vfmadd231pd %ymm0 ,%ymm2 , %ymm11 - -.endm - - -.macro SAVE4x8 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - vmulpd %ymm0 , %ymm7 , %ymm7 - - vmulpd %ymm0 , %ymm8 , %ymm8 - vmulpd %ymm0 , %ymm9 , %ymm9 - vmulpd %ymm0 , %ymm10, %ymm10 - vmulpd %ymm0 , %ymm11, %ymm11 - -#if defined BROADCASTKERNEL - vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 - vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 - vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 - vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 - vunpcklpd %ymm1, %ymm0, %ymm4 - vunpckhpd %ymm1, %ymm0, %ymm5 - vunpcklpd %ymm3, %ymm2, %ymm6 - vunpckhpd %ymm3, %ymm2, %ymm7 -#else - 
vpermilpd $ 0x05 , %ymm5, %ymm5 - vpermilpd $ 0x05 , %ymm7, %ymm7 - - vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 - vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 - vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - - vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#endif - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4, %ymm4 - vaddpd (CO1, LDC), %ymm5, %ymm5 - vaddpd (%rax), %ymm6, %ymm6 - vaddpd (%rax, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (%rax) - vmovups %ymm7 , (%rax, LDC) - - prefetcht0 56(CO1) - prefetcht0 56(CO1,LDC) - prefetcht0 56(%rax) - prefetcht0 56(%rax,LDC) - -#if defined BROADCASTKERNEL - vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 - vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 - vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 - vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 - vunpcklpd %ymm1, %ymm0, %ymm4 - vunpckhpd %ymm1, %ymm0, %ymm5 - vunpcklpd %ymm3, %ymm2, %ymm6 - vunpckhpd %ymm3, %ymm2, %ymm7 -#else - vpermilpd $ 0x05 , %ymm9 , %ymm9 - vpermilpd $ 0x05 , %ymm11, %ymm11 - - vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 - vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 - vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 - vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 - - vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#endif - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %ymm4, %ymm4 - vaddpd (%rax, LDC), %ymm5, %ymm5 - vaddpd (%rbp), %ymm6, %ymm6 - vaddpd (%rbp, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (%rax) - vmovups %ymm5 , (%rax, LDC) - vmovups %ymm6 , (%rbp) - vmovups %ymm7 , (%rbp, LDC) - - prefetcht0 56(%rax) - prefetcht0 56(%rax,LDC) - prefetcht0 56(%rbp) - prefetcht0 56(%rbp,LDC) - - addq $ 4*SIZE, CO1 -.endm - -/******************************************************************************************/ - -.macro INIT2x8 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - -.endm - -.macro KERNEL2x8_SUB - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -12 * SIZE(BO), %xmm1 - vmovddup -11 * SIZE(BO), %xmm2 - vmovddup -10 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm4 - vmovddup -9 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm5 - vmovddup -8 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - vmovddup -7 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm1 , %xmm7 - vmovddup -6 * SIZE(BO), %xmm1 - vfmadd231pd %xmm0 ,%xmm2 , %xmm8 - vmovddup -5 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm3 , %xmm9 - vfmadd231pd %xmm0 ,%xmm1 , %xmm10 - vfmadd231pd %xmm0 ,%xmm2 , %xmm11 - addq $ 8*SIZE, BO - addq $ 2*SIZE, AO - -.endm - -.macro SAVE2x8 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - vmulpd %xmm0 , %xmm8 , %xmm8 - vmulpd %xmm0 , %xmm9 , %xmm9 - vmulpd %xmm0 , %xmm10, %xmm10 - vmulpd %xmm0 , %xmm11, %xmm11 - - leaq (CO1, LDC, 2), %rax - - -#if 
!defined(TRMMKERNEL) - - vaddpd (CO1), %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm5, %xmm5 - vaddpd (%rax), %xmm6, %xmm6 - vaddpd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (%rax) - vmovups %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddpd (%rax), %xmm8 , %xmm4 - vaddpd (%rax, LDC), %xmm9 , %xmm5 - vaddpd (%rbp), %xmm10, %xmm6 - vaddpd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovups %xmm4 , (%rax) - vmovups %xmm5 , (%rax, LDC) - vmovups %xmm6 , (%rbp) - vmovups %xmm7 , (%rbp, LDC) - - addq $ 2*SIZE, CO1 -.endm - - -/******************************************************************************************/ - -.macro INIT1x8 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - -.endm - -.macro KERNEL1x8_SUB - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -11 * SIZE(BO), %xmm2 - vmovsd -10 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vmovsd -9 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - vmovsd -8 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm6 - vmovsd -7 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm1 , %xmm7 - vmovsd -6 * SIZE(BO), %xmm1 - vfmadd231sd %xmm0 ,%xmm2 , %xmm8 - vmovsd -5 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm3 , %xmm9 - vfmadd231sd %xmm0 ,%xmm1 , %xmm10 - vfmadd231sd %xmm0 ,%xmm2 , %xmm11 - addq $ 8*SIZE, BO - addq $ 1*SIZE, AO - -.endm - -.macro SAVE1x8 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm7 , %xmm7 - - vmulsd %xmm0 , %xmm8 , %xmm8 - vmulsd %xmm0 , %xmm9 , %xmm9 - vmulsd %xmm0 , %xmm10, %xmm10 - vmulsd %xmm0 , %xmm11, %xmm11 - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - vaddsd (%rax), %xmm6, %xmm6 - vaddsd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (%rax) - vmovsd %xmm7 , (%rax, LDC) - - - leaq (%rax, LDC, 2), %rax - leaq (%rax, LDC, 2), %rbp - -#if !defined(TRMMKERNEL) - - vaddsd (%rax), %xmm8 , %xmm4 - vaddsd (%rax, LDC), %xmm9 , %xmm5 - vaddsd (%rbp), %xmm10, %xmm6 - vaddsd (%rbp, LDC), %xmm11, %xmm7 - -#endif - - vmovsd %xmm4 , (%rax) - vmovsd %xmm5 , (%rax, LDC) - vmovsd %xmm6 , (%rbp) - vmovsd %xmm7 , (%rbp, LDC) - - addq $ 1*SIZE, CO1 -.endm - - - - - -/******************************************************************************************/ - -.macro INIT4x4 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - -.endm - -.macro KERNEL4x4_I - prefetcht0 A_PR1(AO) - vmovups -12 * SIZE(BO), %ymm1 -#if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -#else - vmovups -16 * SIZE(AO), %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm4 -#if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm5 -#if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , %ymm6 - - addq $ 4*SIZE, BO -#if defined BROADCASTKERNEL - vbroadcastsd -13 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vmulpd %ymm0 ,%ymm1 , 
%ymm7 - vmovups -12 * SIZE(BO), %ymm1 - -.endm - -.macro KERNEL4x4_M1 - prefetcht0 A_PR1(AO) -#if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -#else - vmovups -16 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 -#if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 -#if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 -#if defined BROADCASTKERNEL - vbroadcastsd -13 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -12 * SIZE(BO), %ymm1 - -.endm - -.macro KERNEL4x4_M2 -#if defined BROADCASTKERNEL - vbroadcastsd -12 * SIZE(AO), %ymm0 -#else - vmovups -12 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 -#if defined BROADCASTKERNEL - vbroadcastsd -11 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 -#if defined BROADCASTKERNEL - vbroadcastsd -10 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - - addq $ 8*SIZE, AO -#if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - vmovups -8 * SIZE(BO), %ymm1 - addq $ 8*SIZE, BO -.endm - - -.macro KERNEL4x4_E -#if defined BROADCASTKERNEL - vbroadcastsd -12 * SIZE(AO), %ymm0 -#else - vmovups -12 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 -#if defined BROADCASTKERNEL - vbroadcastsd -11 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 -#if defined BROADCASTKERNEL - vbroadcastsd -10 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - - addq $ 8*SIZE, AO -#if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - addq $ 4*SIZE, BO -.endm - -.macro KERNEL4x4_SUB - vmovups -12 * SIZE(BO), %ymm1 -#if defined BROADCASTKERNEL - vbroadcastsd -16 * SIZE(AO), %ymm0 -#else - vmovups -16 * SIZE(AO), %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm4 -#if defined BROADCASTKERNEL - vbroadcastsd -15 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm5 - addq $ 4*SIZE, BO -#if defined BROADCASTKERNEL - vbroadcastsd -14 * SIZE(AO), %ymm0 -#else - vpermpd $ 0x1b, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm6 - addq $ 4*SIZE, AO -#if defined BROADCASTKERNEL - vbroadcastsd -17 * SIZE(AO), %ymm0 -#else - vpermilpd $ 0x05, %ymm0 , %ymm0 -#endif - vfmadd231pd %ymm0 ,%ymm1 , %ymm7 - -.endm - -.macro SAVE4x4 - - vbroadcastsd ALPHA, %ymm0 - - vmulpd %ymm0 , %ymm4 , %ymm4 - vmulpd %ymm0 , %ymm7 , %ymm7 - vmulpd %ymm0 , %ymm5 , %ymm5 - vmulpd %ymm0 , %ymm6 , %ymm6 - -#if defined BROADCASTKERNEL - vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 - vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 - vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 - vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 - vunpcklpd %ymm1, %ymm0, %ymm4 - vunpckhpd %ymm1, %ymm0, %ymm5 - vunpcklpd %ymm3, %ymm2, %ymm6 - vunpckhpd %ymm3, %ymm2, %ymm7 -#else - vpermilpd $ 0x05 , %ymm5, %ymm5 - vpermilpd $ 0x05 , %ymm7, %ymm7 - - vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 - vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 - vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 - vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 - - vperm2f128 $ 
0x01 , %ymm2, %ymm2 , %ymm2 - vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 - - vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 - vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 - vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 - vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 -#endif - - leaq (CO1, LDC, 2), %rax - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %ymm4, %ymm4 - vaddpd (CO1, LDC), %ymm5, %ymm5 - vaddpd (%rax), %ymm6, %ymm6 - vaddpd (%rax, LDC), %ymm7, %ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , (CO1, LDC) - vmovups %ymm6 , (%rax) - vmovups %ymm7 , (%rax, LDC) - - addq $ 4*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x4 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL2x4_SUB - vmovddup -12 * SIZE(BO), %xmm1 - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -11 * SIZE(BO), %xmm2 - vfmadd231pd %xmm0 ,%xmm1 , %xmm4 - vmovddup -10 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , %xmm5 - vmovddup -9 * SIZE(BO), %xmm8 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - addq $ 4*SIZE, BO - vfmadd231pd %xmm0 ,%xmm8 , %xmm7 - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x4 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - leaq (CO1, LDC, 2), %rax - -#if !defined(TRMMKERNEL) - - vaddpd (CO1), %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm5, %xmm5 - vaddpd (%rax), %xmm6, %xmm6 - vaddpd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (%rax) - vmovups %xmm7 , (%rax, LDC) - - addq $ 2*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x4 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL1x4_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vmovsd -10 * SIZE(BO), %xmm3 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - vmovsd -9 * SIZE(BO), %xmm8 - vfmadd231sd %xmm0 ,%xmm3 , %xmm6 - addq $ 4*SIZE, BO - vfmadd231sd %xmm0 ,%xmm8 , %xmm7 - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x4 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - vmulsd %xmm0 , %xmm6 , %xmm6 - vmulsd %xmm0 , %xmm7 , %xmm7 - - leaq (CO1, LDC, 2), %rax - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - vaddsd (%rax), %xmm6, %xmm6 - vaddsd (%rax, LDC), %xmm7, %xmm7 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (%rax) - vmovsd %xmm7 , (%rax, LDC) - - addq $ 1*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT4x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - -.endm - - -.macro KERNEL4x2_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vmovups -14 * SIZE(AO), %xmm1 - vmovddup -11 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , 
%xmm4 - vfmadd231pd %xmm1 ,%xmm2 , %xmm5 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - vfmadd231pd %xmm1 ,%xmm3 , %xmm7 - addq $ 2*SIZE, BO - addq $ 4*SIZE, AO - -.endm - - -.macro SAVE4x2 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm5 , %xmm5 - vmulpd %xmm0 , %xmm6 , %xmm6 - vmulpd %xmm0 , %xmm7 , %xmm7 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 - vaddpd (CO1, LDC), %xmm6, %xmm6 - vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , 2 * SIZE(CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm7 , 2 * SIZE(CO1, LDC) - - addq $ 4*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm6 , %xmm6 , %xmm6 - -.endm - - -.macro KERNEL2x2_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vmovddup -11 * SIZE(BO), %xmm3 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - vfmadd231pd %xmm0 ,%xmm3 , %xmm6 - addq $ 2*SIZE, BO - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x2 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - vmulpd %xmm0 , %xmm6 , %xmm6 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - vaddpd (CO1, LDC), %xmm6, %xmm6 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - - addq $ 2*SIZE, CO1 -.endm - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x2 - - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - -.endm - - -.macro KERNEL1x2_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vmovsd -11 * SIZE(BO), %xmm2 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - vfmadd231sd %xmm0 ,%xmm2 , %xmm5 - addq $ 2*SIZE, BO - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x2 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - vmulsd %xmm0 , %xmm5 , %xmm5 - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - vaddsd (CO1, LDC), %xmm5, %xmm5 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - - addq $ 1*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT4x1 - - vxorpd %ymm4 , %ymm4 , %ymm4 - vxorpd %ymm5 , %ymm5 , %ymm5 - vxorpd %ymm6 , %ymm6 , %ymm6 - vxorpd %ymm7 , %ymm7 , %ymm7 - -.endm - - -.macro KERNEL4x1 - - vbroadcastsd -12 * SIZE(BO), %ymm0 - vbroadcastsd -11 * SIZE(BO), %ymm1 - vbroadcastsd -10 * SIZE(BO), %ymm2 - vbroadcastsd -9 * SIZE(BO), %ymm3 - - vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4 - vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5 - - vbroadcastsd -8 * SIZE(BO), %ymm0 - vbroadcastsd -7 * SIZE(BO), %ymm1 - - vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6 - vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7 - - vbroadcastsd -6 * SIZE(BO), %ymm2 - vbroadcastsd -5 * SIZE(BO), %ymm3 - - vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4 - vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5 - vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6 - vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7 - - addq $ 8 *SIZE, BO - addq $ 32*SIZE, AO - -.endm - - -.macro KERNEL4x1_SUB - vbroadcastsd -12 * SIZE(BO), %ymm2 - vmovups -16 * SIZE(AO), %ymm0 - vfmadd231pd %ymm0 ,%ymm2 , %ymm4 - addq $ 1*SIZE, BO - addq $ 4*SIZE, AO - 
-.endm - - -.macro SAVE4x1 - - vbroadcastsd ALPHA, %ymm0 - - vaddpd %ymm4,%ymm5, %ymm4 - vaddpd %ymm6,%ymm7, %ymm6 - vaddpd %ymm4,%ymm6, %ymm4 - - vmulpd %ymm0 , %ymm4 , %ymm4 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %ymm4, %ymm4 - -#endif - - vmovups %ymm4 , (CO1) - - addq $ 4*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT2x1 - - vxorpd %xmm4 , %xmm4 , %xmm4 - -.endm - - -.macro KERNEL2x1_SUB - vmovddup -12 * SIZE(BO), %xmm2 - vmovups -16 * SIZE(AO), %xmm0 - vfmadd231pd %xmm0 ,%xmm2 , %xmm4 - addq $ 1*SIZE, BO - addq $ 2*SIZE, AO - -.endm - - -.macro SAVE2x1 - - vmovddup ALPHA, %xmm0 - - vmulpd %xmm0 , %xmm4 , %xmm4 - - -#if !defined(TRMMKERNEL) - - vaddpd (CO1) , %xmm4, %xmm4 - -#endif - - vmovups %xmm4 , (CO1) - - addq $ 2*SIZE, CO1 -.endm - - -/******************************************************************************************/ -/******************************************************************************************/ - -.macro INIT1x1 - - vxorpd %xmm4 , %xmm4 , %xmm4 - -.endm - - -.macro KERNEL1x1_SUB - vmovsd -12 * SIZE(BO), %xmm1 - vmovsd -16 * SIZE(AO), %xmm0 - vfmadd231sd %xmm0 ,%xmm1 , %xmm4 - addq $ 1*SIZE, BO - addq $ 1*SIZE, AO - -.endm - - -.macro SAVE1x1 - - vmovsd ALPHA, %xmm0 - - vmulsd %xmm0 , %xmm4 , %xmm4 - - -#if !defined(TRMMKERNEL) - - vaddsd (CO1), %xmm4, %xmm4 - -#endif - - vmovsd %xmm4 , (CO1) - - addq $ 1*SIZE, CO1 -.endm - - -.macro PREFETCHT0_C - prefetcht0 (CO1) - prefetcht0 24(CO1) - prefetcht0 (CO1,LDC,4) - prefetcht0 24(CO1,LDC,4) - prefetcht0 (CO1,LDC,8) - prefetcht0 24(CO1,LDC,8) -.endm -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovups %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $24, %rdi - divq %rdi // N / 24 - movq %rax, Ndiv12 // N / 24 - movq %rdx, Nmod12 // N % 24 - - - movq Ndiv12, J - cmpq $ 0, J - je .L8_0 - ALIGN_4 - -.L12_01: - // copy to sub buffer - movq K, %rax - salq $3,%rax // K * 8 ; read 8 values from BO1 - movq B, BO1 - leaq (B,%rax, SIZE), BO2 // next offset to BO2 - movq BO2 , B - - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - - ALIGN_4 - -.L12_02b: - - vmovups 0 * SIZE(BO1), %ymm1 - vmovups 4 * SIZE(BO1), %ymm2 - vmovups 0 * SIZE(BO2), %ymm3 - vmovups %ymm1, 0 * SIZE(BO) - vmovups %ymm2, 4 * SIZE(BO) - vmovups %ymm3, 8 * SIZE(BO) - 
addq $ 8*SIZE,BO1 - addq $ 8*SIZE,BO2 - addq $ 12*SIZE,BO - decq %rax - jnz .L12_02b - -.L12_03c: - - -.L12_10: - movq C, CO1 - leaq (C, LDC, 8), C - leaq (C, LDC, 4), C // c += 12 * ldc - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L12_20 - - ALIGN_4 - -.L12_11: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - - jl .L12_13 - - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - subq $2, %rax - je .L12_12a - - ALIGN_5 -.L12_12: - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - dec %rax - jne .L12_12 - -.L12_12a: - prefetcht0 ALPHA - PREFETCHT0_C - addq LDC,CO1 - KERNEL4x12_M1 - PREFETCHT0_C - leaq (CO1,LDC,2),CO1 - KERNEL4x12_M2 - PREFETCHT0_C - subq LDC,CO1 - KERNEL4x12_M1 - PREFETCHT0_C - subq LDC,CO1 - subq LDC,CO1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L12_16 - - -.L12_13: - - test $1, %rax - jz .L12_14 - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L12_16 - - -.L12_14: - - INIT4x12 - - -.L12_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L12_19 - - ALIGN_4 - -.L12_17: - - KERNEL4x12_SUB - - dec %rax - jne .L12_17 - ALIGN_4 - - -.L12_19: - - SAVE4x12 - - /* here for the prefetch of next b source block */ - /* the increment should be proportional to GEMM_Q/GEMM_P */ - - salq $3, K -#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ - prefetcht2 32(B) - prefetcht2 32(B, K, 8) - addq $64, B /* increment */ -#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ - prefetcht2 32(B) - prefetcht2 32(B, K, 8) - prefetcht2 96(B) - prefetcht2 96(B, K, 8) - addq $128, B /* increment */ -#endif - sarq $3, K - - decq I # i -- - jne .L12_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ - - /* recover the original value of pointer B after prefetch */ - movq M, I - sarq $2, I -#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ - salq $6, I -#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ - salq $7, I -#endif - subq I, B - -.L12_20: - // Test rest of M - - testq $3, M - jz .L12_100 // to next 16 lines of N - - -.L12_30: - testq $2, M - jz .L12_40 - - ALIGN_4 - -.L12_31: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x12 - - movq K, %rax - - sarq $3, %rax - je .L12_36 - ALIGN_4 - -.L12_32: - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - dec %rax - jne .L12_32 - ALIGN_4 - -.L12_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L12_39 - - ALIGN_4 - -.L12_37: - - KERNEL2x12_SUB - - dec %rax - jne .L12_37 - ALIGN_4 - - -.L12_39: - - SAVE2x12 - - ALIGN_4 - -.L12_40: - testq $1, M - jz .L12_100 // to next 3 lines of N - - ALIGN_4 - -.L12_41: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x12 - - movq K, %rax - - sarq $3,%rax - je .L12_46 - - ALIGN_4 - -.L12_42: - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - - dec %rax - jne .L12_42 - ALIGN_4 - -.L12_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je 
.L12_49 - - ALIGN_4 - -.L12_47: - - KERNEL1x12_SUB - - dec %rax - jne .L12_47 - ALIGN_4 - - -.L12_49: - - SAVE1x12 - - ALIGN_4 - -.L12_100: - - - -/**************************************************************************************************/ - -.L13_01: - // copy to sub buffer - movq K, %rax - salq $3,%rax // K * 8 ; read 8 values - movq B, BO2 - leaq (B,%rax, SIZE), BO3 // next offset to BO2 - leaq (BO3,%rax, SIZE), B // next offset to B - - - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - - ALIGN_4 - - -.L13_02b: - - vmovups 4 * SIZE(BO2), %ymm1 - vmovups 0 * SIZE(BO3), %ymm2 - vmovups 4 * SIZE(BO3), %ymm3 - vmovups %ymm1, 0 * SIZE(BO) - vmovups %ymm2, 4 * SIZE(BO) - vmovups %ymm3, 8 * SIZE(BO) - addq $ 8*SIZE,BO2 - addq $ 8*SIZE,BO3 - addq $ 12*SIZE,BO - decq %rax - jnz .L13_02b - - - -.L13_10: - movq C, CO1 - leaq (C, LDC, 8), C - leaq (C, LDC, 4), C // c += 12 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L13_20 - - ALIGN_4 - -.L13_11: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - - jl .L13_13 - - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - subq $2, %rax - je .L13_12a - - ALIGN_5 -.L13_12: - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - dec %rax - jne .L13_12 - -.L13_12a: - prefetcht0 ALPHA - PREFETCHT0_C - addq LDC,CO1 - KERNEL4x12_M1 - PREFETCHT0_C - leaq (CO1,LDC,2),CO1 - KERNEL4x12_M2 - PREFETCHT0_C - subq LDC,CO1 - KERNEL4x12_M1 - PREFETCHT0_C - subq LDC,CO1 - subq LDC,CO1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L13_16 - -.L13_13: - - test $1, %rax - jz .L13_14 - - KERNEL4x12_I - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_M2 - - KERNEL4x12_M1 - KERNEL4x12_M2 - KERNEL4x12_M1 - KERNEL4x12_E - - jmp .L13_16 - - -.L13_14: - - INIT4x12 - - -.L13_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L13_19 - - ALIGN_4 - -.L13_17: - - KERNEL4x12_SUB - - dec %rax - jne .L13_17 - ALIGN_4 - - -.L13_19: - - SAVE4x12 - - /* here for the prefetch of next b source block */ - /* the increment should be proportional to GEMM_Q/GEMM_P */ - - salq $3, K -#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ - prefetcht2 (B) - prefetcht2 (B, K, 8) - addq $64, B /* increment */ -#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ - prefetcht2 (B) - prefetcht2 (B, K, 8) - prefetcht2 64(B) - prefetcht2 64(B, K, 8) - addq $128, B /* increment */ -#endif - sarq $3, K - - decq I # i -- - jne .L13_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ - /* recover the original value of pointer B */ - movq M, I - sarq $2, I -#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ - salq $6, I -#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ - salq $7, I -#endif - subq I, B - -.L13_20: - // Test rest of M - - testq $3, M - jz .L13_100 // to next 16 lines of N - - -.L13_30: - testq $2, M - jz .L13_40 - - ALIGN_4 - -.L13_31: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x12 - - movq K, %rax - - sarq $3, %rax - je .L13_36 - ALIGN_4 - -.L13_32: - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - KERNEL2x12_SUB - - dec %rax - 
jne .L13_32 - ALIGN_4 - -.L13_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L13_39 - - ALIGN_4 - -.L13_37: - - KERNEL2x12_SUB - - dec %rax - jne .L13_37 - ALIGN_4 - - -.L13_39: - - SAVE2x12 - - ALIGN_4 - -.L13_40: - testq $1, M - jz .L13_100 // to next 3 lines of N - - ALIGN_4 - -.L13_41: - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x12 - - movq K, %rax - - sarq $3,%rax - je .L13_46 - - ALIGN_4 - -.L13_42: - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - KERNEL1x12_SUB - - - dec %rax - jne .L13_42 - ALIGN_4 - -.L13_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L13_49 - - ALIGN_4 - -.L13_47: - - KERNEL1x12_SUB - - dec %rax - jne .L13_47 - ALIGN_4 - - -.L13_49: - - SAVE1x12 - - ALIGN_4 - -.L13_100: - - decq J // j -- - jg .L12_01 - - - - -/**************************************************************************************************/ - -.L8_0: - - cmpq $ 0, Nmod12 // N % 12 == 0 - je .L999 - - movq Nmod12, J - sarq $3, J // j = j / 8 - je .L4_0 - -.L8_10: - movq C, CO1 - leaq (C, LDC, 8), C // c += 4 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L8_20 - - ALIGN_4 - -.L8_11: - movq B, BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L8_13 - - - KERNEL4x8_I - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - subq $2, %rax - je .L8_12a - - ALIGN_5 - -.L8_12: - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - dec %rax - jne .L8_12 - -.L8_12a: - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_E - - jmp .L8_16 - - -.L8_13: - - test $1, %rax - jz .L8_14 - - KERNEL4x8_I - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_E - - jmp .L8_16 - - -.L8_14: - - INIT4x8 - - -.L8_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L8_19 - - ALIGN_4 - -.L8_17: - - KERNEL4x8_SUB - - dec %rax - jne .L8_17 - ALIGN_4 - - -.L8_19: - - SAVE4x8 - - decq I # i -- - jg .L8_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L8_20: - // Test rest of M - - testq $3, M - jz .L8_100 // to next 16 lines of N - - -.L8_30: - testq $2, M - jz .L8_40 - - ALIGN_4 - -.L8_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x8 - - movq K, %rax - - sarq $3, %rax - je .L8_36 - ALIGN_4 - -.L8_32: - - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - - dec %rax - jne .L8_32 - ALIGN_4 - -.L8_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L8_39 - - ALIGN_4 - -.L8_37: - - KERNEL2x8_SUB - - dec %rax - jne .L8_37 - - -.L8_39: - - SAVE2x8 - -.L8_40: - testq $1, M - jz .L8_100 // to next 3 lines of N - - ALIGN_4 - -.L8_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x8 - - movq K, %rax - - sarq $3,%rax - je .L8_46 - - ALIGN_4 - -.L8_42: - - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - - dec %rax - jne .L8_42 - ALIGN_4 - -.L8_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L8_49 - - ALIGN_4 - -.L8_47: - - 
KERNEL1x8_SUB - - dec %rax - jne .L8_47 - ALIGN_4 - - -.L8_49: - - SAVE1x8 - - ALIGN_4 - -.L8_100: - - movq K, %rax - salq $3, %rax // * 8 - leaq (B , %rax, SIZE), B - decq J // j -- - jg .L8_10 - - - -/**************************************************************************************************/ - -.L4_0: - - cmpq $ 0, Nmod12 // N % 12 == 0 - je .L999 - - movq Nmod12, J - testq $4, J // j = j / 4 - je .L2_0 - -.L4_10: - movq C, CO1 - leaq (C, LDC, 4), C // c += 4 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L4_20 - - ALIGN_4 - -.L4_11: - movq B, BO - addq $12 * SIZE, BO - - movq K, %rax - - sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L4_13 - - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - subq $2, %rax - je .L4_12a - - ALIGN_5 - -.L4_12: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - dec %rax - jne .L4_12 - -.L4_12a: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_13: - - test $1, %rax - jz .L4_14 - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_14: - - INIT4x4 - - -.L4_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_19 - - ALIGN_4 - -.L4_17: - - KERNEL4x4_SUB - - dec %rax - jne .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE4x4 - - decq I # i -- - jg .L4_11 - - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $3, M - jz .L4_100 // to next 16 lines of N - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x4 - - movq K, %rax - - sarq $3, %rax - je .L4_36 - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - dec %rax - jne .L4_32 - ALIGN_4 - -.L4_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_39 - - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - dec %rax - jne .L4_37 - - -.L4_39: - - SAVE2x4 - -.L4_40: - testq $1, M - jz .L4_100 // to next 3 lines of N - - ALIGN_4 - -.L4_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x4 - - movq K, %rax - - sarq $3,%rax - je .L4_46 - - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - dec %rax - jne .L4_42 - ALIGN_4 - -.L4_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L4_49 - - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - dec %rax - jne .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - - ALIGN_4 - -.L4_100: - - movq K, %rax - salq $2, %rax // * 4 - leaq (B , %rax, SIZE), B - - - - -/***************************************************************************************************************/ - -.L2_0: - - movq Nmod12, J - testq $2, J - je .L1_0 - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L2_20 - - ALIGN_4 - -.L2_11: - movq B, BO - addq $12 * SIZE, BO - - INIT4x2 - - movq K, %rax - sarq $3, %rax // K / 8 - - je .L2_16 - - ALIGN_5 - -.L2_12: - - 
KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - dec %rax - jne .L2_12 - - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB - - dec %rax - jne .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE4x2 - - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $3, M - jz .L2_100 // to next 16 lines of N - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x2 - - movq K, %rax - - sarq $3, %rax - je .L2_36 - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - dec %rax - jne .L2_32 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - dec %rax - jne .L2_37 - - -.L2_39: - - SAVE2x2 - -.L2_40: - testq $1, M - jz .L2_100 // to next 3 lines of N - -.L2_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x2 - - movq K, %rax - - sarq $3,%rax - je .L2_46 - - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - dec %rax - jne .L2_42 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - dec %rax - jne .L2_47 - -.L2_49: - - SAVE1x2 - -.L2_100: - - movq K, %rax - salq $1, %rax // * 2 - leaq (B , %rax, SIZE), B - -/***************************************************************************************************************/ - -.L1_0: - - movq Nmod12, J - testq $1, J - je .L999 - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L1_20 - - ALIGN_4 - -.L1_11: - movq B, BO - addq $12 * SIZE, BO - - INIT4x1 - - movq K, %rax - - sarq $3, %rax // K / 8 - je .L1_16 - - ALIGN_5 - -.L1_12: - - KERNEL4x1 - - dec %rax - jne .L1_12 - - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB - - dec %rax - jne .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE4x1 - - decq I # i -- - jg .L1_11 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $3, M - jz .L1_100 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT2x1 - - movq K, %rax - - sarq $3, %rax - je .L1_36 - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - - dec %rax - jne .L1_32 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - dec %rax - jne .L1_37 - -.L1_39: - - SAVE2x1 - -.L1_40: - testq $1, M - jz .L1_100 // to next 3 lines of N - - -.L1_41: - movq B, BO // first buffer to BO - addq $12 * SIZE, BO - - INIT1x1 - - movq K, %rax - - sarq $3,%rax - je .L1_46 - - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - 
KERNEL1x1_SUB - - dec %rax - jne .L1_42 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - dec %rax - jne .L1_47 - - -.L1_49: - - SAVE1x1 - -.L1_100: - - - - -.L999: - vzeroupper - - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - vmovups 64(%rsp), %xmm6 - vmovups 80(%rsp), %xmm7 - vmovups 96(%rsp), %xmm8 - vmovups 112(%rsp), %xmm9 - vmovups 128(%rsp), %xmm10 - vmovups 144(%rsp), %xmm11 - vmovups 160(%rsp), %xmm12 - vmovups 176(%rsp), %xmm13 - vmovups 192(%rsp), %xmm14 - vmovups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - vmovups %xmm6, 64(%rsp) - vmovups %xmm7, 80(%rsp) - vmovups %xmm8, 96(%rsp) - vmovups %xmm9, 112(%rsp) - vmovups %xmm10, 128(%rsp) - vmovups %xmm11, 144(%rsp) - vmovups %xmm12, 160(%rsp) - vmovups %xmm13, 176(%rsp) - vmovups %xmm14, 192(%rsp) - vmovups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovups %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - vmovsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $ 0, OLD_M - je .L999 - - cmpq $ 0, OLD_N - je .L999 - - cmpq $ 0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $8, %rdi - divq %rdi // N / 8 - movq %rax, Ndiv12 // N / 8 - movq %rdx, Nmod12 // N % 8 - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - -/*************************************************************************************************/ -.L8_0: - movq Ndiv12, J - cmpq $ 0, J - je .L4_0 - ALIGN_4 - -.L8_10: - movq C, CO1 - leaq (C, LDC, 8), C // c += 8 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L8_20 - - ALIGN_4 - -.L8_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,8), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $8, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - 
- sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L8_13 - - - KERNEL4x8_I - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - subq $2, %rax - je .L8_12a - - ALIGN_5 - -.L8_12: - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - dec %rax - jne .L8_12 - -.L8_12a: - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_E - - jmp .L8_16 - - -.L8_13: - - test $1, %rax - jz .L8_14 - - KERNEL4x8_I - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_M2 - - KERNEL4x8_M1 - KERNEL4x8_M2 - KERNEL4x8_M1 - KERNEL4x8_E - - jmp .L8_16 - - -.L8_14: - - INIT4x8 - - -.L8_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L8_19 - - ALIGN_4 - -.L8_17: - - KERNEL4x8_SUB - - dec %rax - jne .L8_17 - ALIGN_4 - - -.L8_19: - - SAVE4x8 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 8), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - decq I # i -- - jg .L8_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L8_20: - // Test rest of M - - testq $3, M - jz .L8_100 // to next 16 lines of N - - -.L8_30: - testq $2, M - jz .L8_40 - - ALIGN_4 - -.L8_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,8), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $8, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x8 - - sarq $3, %rax - je .L8_36 - ALIGN_4 - -.L8_32: - - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - KERNEL2x8_SUB - - dec %rax - jne .L8_32 - ALIGN_4 - -.L8_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L8_39 - - ALIGN_4 - -.L8_37: - - KERNEL2x8_SUB - - dec %rax - jne .L8_37 - - -.L8_39: - - SAVE2x8 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 8), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L8_40: - testq $1, M - jz .L8_100 // to next 3 lines of N - - ALIGN_4 - -.L8_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq 
(BO,%rax,8), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $8, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x8 - - sarq $3,%rax - je .L8_46 - - ALIGN_4 - -.L8_42: - - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - KERNEL1x8_SUB - - dec %rax - jne .L8_42 - ALIGN_4 - -.L8_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L8_49 - - ALIGN_4 - -.L8_47: - - KERNEL1x8_SUB - - dec %rax - jne .L8_47 - ALIGN_4 - - -.L8_49: - - SAVE1x8 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 8), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - -.L8_100: - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $8, KK // number of values in B -#endif - - - decq J // j -- - jg .L8_10 - - - - - -/*************************************************************************************************/ -.L4_0: - movq Nmod12, J - testq $4, J - je .L2_0 - ALIGN_4 - -.L4_10: - movq C, CO1 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L4_20 - - ALIGN_4 - -.L4_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,4), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - sarq $3, %rax // K / 8 - cmpq $2, %rax - jl .L4_13 - - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - subq $2, %rax - je .L4_12a - - ALIGN_5 - -.L4_12: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - dec %rax - jne .L4_12 - -.L4_12a: - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_13: - - test $1, %rax - jz .L4_14 - - KERNEL4x4_I - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_M2 - - KERNEL4x4_M1 - KERNEL4x4_M2 - KERNEL4x4_M1 - KERNEL4x4_E - - jmp .L4_16 - - -.L4_14: - - INIT4x4 - - -.L4_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L4_19 - - ALIGN_4 - -.L4_17: - - KERNEL4x4_SUB - - dec %rax - jne .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 4), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $3, M - jz .L4_100 // to next 16 lines of N - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,4), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x4 - - sarq $3, %rax - je .L4_36 - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - dec %rax - jne .L4_32 - ALIGN_4 - -.L4_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L4_39 - - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - dec %rax - jne .L4_37 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 4), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L4_40: - testq $1, M - jz .L4_100 // to next 3 lines of N - - ALIGN_4 - -.L4_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,4), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x4 - - sarq $3,%rax - je .L4_46 - - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - dec %rax - jne .L4_42 - ALIGN_4 - -.L4_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L4_49 - - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - dec %rax - jne .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, 
%rax // rax + SIZE - leaq (BO, %rax, 4), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - -.L4_100: - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK // number of values in B -#endif - - - movq K, %rax - salq $2, %rax // * 4 - leaq (B , %rax, SIZE), B - - - - -/***************************************************************************************************************/ - -.L2_0: - - movq Nmod12, J - testq $2, J - je .L1_0 - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L2_20 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,2), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT4x2 - - sarq $3, %rax // K / 8 - - je .L2_16 - - ALIGN_5 - -.L2_12: - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - dec %rax - jne .L2_12 - - -.L2_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - ALIGN_4 - -.L2_17: - - KERNEL4x2_SUB - - dec %rax - jne .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 2), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $3, M - jz .L2_100 // to next 16 lines of N - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,2), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x2 - - sarq $3, %rax - je .L2_36 - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB 
- KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - dec %rax - jne .L2_32 - -.L2_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - dec %rax - jne .L2_37 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax + SIZE - leaq (BO, %rax, 2), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L2_40: - testq $1, M - jz .L2_100 // to next 3 lines of N - -.L2_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,2), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x2 - - sarq $3,%rax - je .L2_46 - - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - dec %rax - jne .L2_42 - -.L2_46: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - dec %rax - jne .L2_47 - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 2), BO // number of values in B - leaq (AO, %rax, 1), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK // number of values in A -#endif - - -.L2_100: - - -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK // number of values in B -#endif - - movq K, %rax - salq $1, %rax // * 2 - leaq (B , %rax, SIZE), B - -/***************************************************************************************************************/ - -.L1_0: - - movq Nmod12, J - testq $1, J - je .L999 - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $2, I // i = m / 4 - je .L1_20 - - ALIGN_4 - -.L1_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,1), BO // add number of values in B - leaq (AO,%rax,4), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $1, %rax // number 
of values in BO -#endif - movq %rax, KKK -#endif - - INIT4x1 - - sarq $3, %rax // K / 8 - je .L1_16 - - ALIGN_5 - -.L1_12: - - KERNEL4x1 - - dec %rax - jne .L1_12 - - -.L1_16: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - ALIGN_4 - -.L1_17: - - KERNEL4x1_SUB - - dec %rax - jne .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 1), BO // number of values in B - leaq (AO, %rax, 4), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK // number of values in A -#endif - - - decq I # i -- - jg .L1_11 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $3, M - jz .L1_100 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,1), BO // add number of values in B - leaq (AO,%rax,2), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT2x1 - - sarq $3, %rax - je .L1_36 - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - - dec %rax - jne .L1_32 - -.L1_36: - movq KKK, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - dec %rax - jne .L1_37 - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - salq $3, %rax // rax * SIZE - leaq (BO, %rax, 1), BO // number of values in B - leaq (AO, %rax, 2), AO // number of values in A -#endif - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK // number of values in A -#endif - - -.L1_40: - testq $1, M - jz .L1_100 // to next 3 lines of N - - -.L1_41: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq B, BO - addq $12 * SIZE, BO -#else - movq B, BO - addq $12 * SIZE, BO - movq KK, %rax - salq $3, %rax // rax * SIZE - leaq (BO,%rax,1), BO // add number of values in B - leaq (AO,%rax,1), AO // add number of values in A -#endif - - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - INIT1x1 - - sarq $3,%rax - je .L1_46 - - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - 
- dec %rax
- jne .L1_42
-
-.L1_46:
- movq KKK, %rax
-
- andq $7, %rax # if (k & 1)
- je .L1_49
-
- ALIGN_4
-
-.L1_47:
-
- KERNEL1x1_SUB
-
- dec %rax
- jne .L1_47
-
-
-.L1_49:
-
- SAVE1x1
-
-#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
- (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
- movq K, %rax
- subq KKK, %rax
- salq $3, %rax // rax * SIZE
- leaq (BO, %rax, 1), BO // number of values in B
- leaq (AO, %rax, 1), AO // number of values in A
-#endif
-
-#if defined(TRMMKERNEL) && defined(LEFT)
- addq $1, KK // number of values in A
-#endif
-
-
-
-.L1_100:
-
-
-#if defined(TRMMKERNEL) && !defined(LEFT)
- addq $1, KK // number of values in B
-#endif
-
-
-
-.L999:
-
- vzeroupper
-
- movq SP, %rsp
- movq (%rsp), %rbx
- movq 8(%rsp), %rbp
- movq 16(%rsp), %r12
- movq 24(%rsp), %r13
- movq 32(%rsp), %r14
- movq 40(%rsp), %r15
-
-#ifdef WINDOWS_ABI
- movq 48(%rsp), %rdi
- movq 56(%rsp), %rsi
- vmovups 64(%rsp), %xmm6
- vmovups 80(%rsp), %xmm7
- vmovups 96(%rsp), %xmm8
- vmovups 112(%rsp), %xmm9
- vmovups 128(%rsp), %xmm10
- vmovups 144(%rsp), %xmm11
- vmovups 160(%rsp), %xmm12
- vmovups 176(%rsp), %xmm13
- vmovups 192(%rsp), %xmm14
- vmovups 208(%rsp), %xmm15
-#endif
-
- addq $STACKSIZE, %rsp
- ret
-
- EPILOGUE
-
-
-
-
-
-#endif
+/*********************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+
+#define ASSEMBLER
+#include "common.h"
+
+#define OLD_M %rdi
+#define OLD_N %rsi
+#define M %r13
+#define J %r14
+#define OLD_K %rdx
+
+#define A %rcx
+#define B %r8
+#define C %r9
+#define LDC %r10
+
+#define I %r11
+#define AO %rdi
+#define BO %rsi
+#define CO1 %r15
+#define K %r12
+#define SP %rbx
+
+#define BO1 %rdi
+#define BO2 %r15
+#define BO3 %rbp
+
+#ifndef WINDOWS_ABI
+
+#define STACKSIZE 96
+#define L_BUFFER_SIZE 256*8*12+4096
+
+#else
+
+#define STACKSIZE 256
+#define L_BUFFER_SIZE 128*8*12+512
+
+#define OLD_A 40 + STACKSIZE(%rsp)
+#define OLD_B 48 + STACKSIZE(%rsp)
+#define OLD_C 56 + STACKSIZE(%rsp)
+#define OLD_LDC 64 + STACKSIZE(%rsp)
+#define OLD_OFFSET 72 + STACKSIZE(%rsp)
+
+#endif
+
+
+#define Ndiv12 24(%rsp)
+#define Nmod12 32(%rsp)
+#define N 40(%rsp)
+#define ALPHA 48(%rsp)
+#define OFFSET 56(%rsp)
+#define KK 64(%rsp)
+#define KKK 72(%rsp)
+#define BUFFER1 128(%rsp)
+
+#if defined(OS_WINDOWS)
+#if L_BUFFER_SIZE > 16384
+#define STACK_TOUCH \
+ movl $ 0, 4096 * 4(%rsp);\
+ movl $ 0, 4096 * 3(%rsp);\
+ movl $ 0, 4096 * 2(%rsp);\
+ movl $ 0, 4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 12288
+#define STACK_TOUCH \
+ movl $ 0, 4096 * 3(%rsp);\
+ movl $ 0, 4096 * 2(%rsp);\
+ movl $ 0, 4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 8192
+#define STACK_TOUCH \
+ movl $ 0, 4096 * 2(%rsp);\
+ movl $ 0, 4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 4096
+#define STACK_TOUCH \
+ movl $ 0, 4096 * 1(%rsp);
+#else
+#define STACK_TOUCH
+#endif
+#else
+#define STACK_TOUCH
+#endif
+
+#define A_PR1 512
+#define B_PR1 160
+#define BROADCASTKERNEL
+
+/*******************************************************************************************
+* Macro definitions
+*******************************************************************************************/
+
+.macro INIT4x12
+
+ vxorpd %ymm4 , %ymm4 , %ymm4
+ vxorpd %ymm5 , %ymm5 , %ymm5
+ vxorpd %ymm6 , %ymm6 , %ymm6
+ vxorpd %ymm7 , %ymm7 , %ymm7
+ vxorpd %ymm8 , %ymm8 , %ymm8
+ vxorpd %ymm9 , %ymm9 , %ymm9
+ vxorpd %ymm10, %ymm10, %ymm10
+ vxorpd %ymm11, %ymm11, %ymm11
+ vxorpd %ymm12, %ymm12, %ymm12
+ vxorpd %ymm13, %ymm13, %ymm13
+ vxorpd %ymm14, %ymm14, %ymm14
+ vxorpd %ymm15, %ymm15, %ymm15
+
+.endm
+
+.macro KERNEL4x12_I
+ prefetcht0 A_PR1(AO)
+ vmovups -12 * SIZE(BO), %ymm1
+ prefetcht0 B_PR1(BO)
+# if defined BROADCASTKERNEL
+ vbroadcastsd -16 * SIZE(AO), %ymm0
+# else
+ vmovups -16 * SIZE(AO), %ymm0
+# endif
+ prefetcht0 B_PR1+64(BO)
+ vmovups -8 * SIZE(BO), %ymm2
+ prefetcht0 B_PR1+128(BO)
+ vmovups -4 * SIZE(BO), %ymm3
+ vmulpd %ymm0 ,%ymm1 , %ymm4
+ prefetcht0 B_PR1+192(BO)
+ vmulpd %ymm0 ,%ymm2 , %ymm8
+ vmulpd %ymm0 ,%ymm3 , %ymm12
+ prefetcht0 B_PR1+256(BO)
+# if defined BROADCASTKERNEL
+ vbroadcastsd -15 * SIZE(AO), %ymm0
+# else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+# endif
+ vmulpd %ymm0 ,%ymm1 , %ymm5
+ vmulpd %ymm0 ,%ymm2 , %ymm9
+ vmulpd %ymm0 ,%ymm3 , %ymm13
+# if defined BROADCASTKERNEL
+ vbroadcastsd -14 * SIZE(AO), %ymm0
+# else
+ vpermpd $ 0x1b, %ymm0 , %ymm0
+# endif
+ vmulpd %ymm0 ,%ymm1 , %ymm6
+ vmulpd %ymm0 ,%ymm2 , %ymm10
+
+ addq $ 12*SIZE, BO
+ vmulpd %ymm0 ,%ymm3 , %ymm14
+# if defined BROADCASTKERNEL
+ vbroadcastsd -13 * SIZE(AO), %ymm0
+# else
+ vpermilpd $ 0x05, %ymm0 , %ymm0
+# endif
+ vmulpd %ymm0 ,%ymm1 , %ymm7
+ vmovups -12 * SIZE(BO), %ymm1
+ vmulpd %ymm0 ,%ymm2 , %ymm11
+ vmovups -8 * SIZE(BO), %ymm2
+ vmulpd %ymm0 ,%ymm3 , %ymm15
+ vmovups -4 * SIZE(BO), %ymm3
+
+.endm
+
+.macro KERNEL4x12_M1
+ prefetcht0 A_PR1(AO)
+# if
defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else + vmovups -16 * SIZE(AO), %ymm0 +# endif + prefetcht0 B_PR1(BO) + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+64(BO) + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + prefetcht0 B_PR1+128(BO) + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else + vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups -4 * SIZE(BO), %ymm3 + +.endm + +.macro KERNEL4x12_M2 +# if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +# else + vmovups -12 * SIZE(AO), %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +# else + vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups 0 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups 4 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + vmovups 8 * SIZE(BO), %ymm3 + addq $ 24*SIZE, BO +.endm + + +.macro KERNEL4x12_E +# if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +# else + vmovups -12 * SIZE(AO), %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +# else + vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + addq $ 12*SIZE, BO +.endm + +.macro KERNEL4x12_SUB + vmovups -12 * SIZE(BO), %ymm1 +# if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +# else + vmovups -16 * SIZE(AO), %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 + vmovups -4 * SIZE(BO), %ymm3 + vfmadd231pd %ymm0 ,%ymm3 , %ymm12 +# if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + 
vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + addq $ 12*SIZE, BO + vfmadd231pd %ymm0 ,%ymm3 , %ymm13 +# if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +# else + vpermpd $ 0x1b, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + addq $ 4*SIZE, AO + vfmadd231pd %ymm0 ,%ymm3 , %ymm14 +# if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +# else + vpermilpd $ 0x05, %ymm0 , %ymm0 +# endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vfmadd231pd %ymm0 ,%ymm3 , %ymm15 + +.endm + + +.macro SAVE4x12 + + prefetcht0 BUFFER1 + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm7 , %ymm7 + prefetcht0 64 + BUFFER1 + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm11, %ymm11 +#if B_PR1 > 32 + prefetcht0 128 + BUFFER1 +#endif + vmulpd %ymm0 , %ymm12, %ymm12 + vmulpd %ymm0 , %ymm13, %ymm13 + vmulpd %ymm0 , %ymm14, %ymm14 + vmulpd %ymm0 , %ymm15, %ymm15 +#if B_PR1 > 96 + prefetcht0 192 + BUFFER1 +#endif + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 +#else + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 +#endif + +#if B_PR1 > 160 + prefetcht0 256 + BUFFER1 +#endif + +#if defined BROADCASTKERNEL + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 +#endif + +#if B_PR1 > 224 + prefetcht0 320 + BUFFER1 +#endif + +#ifndef BROADCASTKERNEL + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 +#endif + +#if B_PR1 > 288 + prefetcht0 384 + BUFFER1 +#endif + +#ifndef BROADCASTKERNEL + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + +#if B_PR1 > 352 + prefetcht0 448 + BUFFER1 +#endif + leaq (CO1, LDC, 2), %rax + +#if B_PR1 > 416 + prefetcht0 512 + BUFFER1 +#endif + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + prefetcht1 56(CO1) + prefetcht1 56(CO1,LDC) + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 + vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 + vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 + vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm9, %ymm9 + vpermilpd $ 0x05 , %ymm11, %ymm11 + + vblendpd $ 0x0a, %ymm9, %ymm8, %ymm0 + vblendpd $ 0x05, %ymm9, %ymm8, %ymm1 + vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 + vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 + + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, 
%ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) + prefetcht1 56(%rbp) + prefetcht1 56(%rbp,LDC) + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm14, %ymm12 , %ymm0 + vperm2f128 $ 0x20 , %ymm15, %ymm13 , %ymm1 + vperm2f128 $ 0x31 , %ymm14, %ymm12 , %ymm2 + vperm2f128 $ 0x31 , %ymm15, %ymm13 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm13, %ymm13 + vpermilpd $ 0x05 , %ymm15, %ymm15 + + vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0 + vblendpd $ 0x05, %ymm13, %ymm12, %ymm1 + vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2 + vblendpd $ 0x05, %ymm15, %ymm14, %ymm3 + + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht1 56(%rax) + prefetcht1 56(%rax,LDC) + prefetcht1 56(%rbp) + prefetcht1 56(%rbp,LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ + +.macro INIT2x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL2x12_SUB + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -12 * SIZE(BO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm2 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -9 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -8 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vmovddup -7 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm7 + vmovddup -6 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm8 + vmovddup -5 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm9 + vmovddup -4 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm10 + vmovddup -3 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm11 + vmovddup -2 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm12 + vmovddup -1 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm13 + addq $ 12*SIZE, BO + vfmadd231pd %xmm0 ,%xmm2 , %xmm14 + addq $ 2*SIZE, AO + vfmadd231pd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE2x12 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + vmulpd %xmm0 , %xmm8 , %xmm8 + vmulpd %xmm0 , %xmm9 , %xmm9 + vmulpd %xmm0 , %xmm10, %xmm10 + vmulpd %xmm0 , %xmm11, %xmm11 + + vmulpd %xmm0 , 
%xmm12, %xmm12 + vmulpd %xmm0 , %xmm13, %xmm13 + vmulpd %xmm0 , %xmm14, %xmm14 + vmulpd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm8 , %xmm4 + vaddpd (%rax, LDC), %xmm9 , %xmm5 + vaddpd (%rbp), %xmm10, %xmm6 + vaddpd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm12, %xmm4 + vaddpd (%rax, LDC), %xmm13, %xmm5 + vaddpd (%rbp), %xmm14, %xmm6 + vaddpd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ + +.macro INIT1x12 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, %xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 + +.endm + +.macro KERNEL1x12_SUB + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -11 * SIZE(BO), %xmm2 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -9 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -8 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + vmovsd -7 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm7 + vmovsd -6 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm8 + vmovsd -5 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm9 + vmovsd -4 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm10 + vmovsd -3 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm11 + vmovsd -2 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm12 + vmovsd -1 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm13 + addq $ 12*SIZE, BO + vfmadd231sd %xmm0 ,%xmm2 , %xmm14 + addq $ 1*SIZE, AO + vfmadd231sd %xmm0 ,%xmm3 , %xmm15 + +.endm + +.macro SAVE1x12 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm9 , %xmm9 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm11, %xmm11 + + vmulsd %xmm0 , %xmm12, %xmm12 + vmulsd %xmm0 , %xmm13, %xmm13 + vmulsd %xmm0 , %xmm14, %xmm14 + vmulsd %xmm0 , %xmm15, %xmm15 + + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm8 , %xmm4 + vaddsd (%rax, LDC), %xmm9 , %xmm5 + vaddsd (%rbp), %xmm10, %xmm6 + vaddsd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 
, (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + + leaq (%rax, LDC, 4), %rax + leaq (%rbp, LDC, 4), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm12, %xmm4 + vaddsd (%rax, LDC), %xmm13, %xmm5 + vaddsd (%rbp), %xmm14, %xmm6 + vaddsd (%rbp, LDC), %xmm15, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + addq $ 1*SIZE, CO1 +.endm + + + + +/******************************************************************************************/ + + +.macro INIT4x8 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + vxorpd %ymm8 , %ymm8 , %ymm8 + vxorpd %ymm9 , %ymm9 , %ymm9 + vxorpd %ymm10, %ymm10, %ymm10 + vxorpd %ymm11, %ymm11, %ymm11 + +.endm + +.macro KERNEL4x8_I + vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else + vmovups -16 * SIZE(AO), %ymm0 +#endif + vmovups -8 * SIZE(BO), %ymm2 + vmulpd %ymm0 ,%ymm1 , %ymm4 + vmulpd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm5 + vmulpd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm6 + vmulpd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vmulpd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + +.endm + +.macro KERNEL4x8_M1 + prefetcht0 A_PR1(AO) +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else + vmovups -16 * SIZE(AO), %ymm0 +#endif + prefetcht0 B_PR1(BO) + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + prefetcht0 B_PR1+64(BO) + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups -8 * SIZE(BO), %ymm2 + +.endm + +.macro KERNEL4x8_M2 +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else + vmovups -12 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -4 * SIZE(BO), %ymm1 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + vmovups 0 * SIZE(BO), %ymm2 + addq $ 16*SIZE, BO +.endm + + +.macro KERNEL4x8_E +#if defined BROADCASTKERNEL + vbroadcastsd -12 * 
SIZE(AO), %ymm0 +#else + vmovups -12 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + + addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + addq $ 8*SIZE, BO +.endm + +.macro KERNEL4x8_SUB + vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else + vmovups -16 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 + vmovups -8 * SIZE(BO), %ymm2 + vfmadd231pd %ymm0 ,%ymm2 , %ymm8 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + vfmadd231pd %ymm0 ,%ymm2 , %ymm9 + addq $ 8*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + vfmadd231pd %ymm0 ,%ymm2 , %ymm10 + addq $ 4*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vfmadd231pd %ymm0 ,%ymm2 , %ymm11 + +.endm + + +.macro SAVE4x8 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + vmulpd %ymm0 , %ymm7 , %ymm7 + + vmulpd %ymm0 , %ymm8 , %ymm8 + vmulpd %ymm0 , %ymm9 , %ymm9 + vmulpd %ymm0 , %ymm10, %ymm10 + vmulpd %ymm0 , %ymm11, %ymm11 + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 + + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 + + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + prefetcht0 56(CO1) + prefetcht0 56(CO1,LDC) + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm10, %ymm8 , %ymm0 + vperm2f128 $ 0x20 , %ymm11, %ymm9 , %ymm1 + vperm2f128 $ 0x31 , %ymm10, %ymm8 , %ymm2 + vperm2f128 $ 0x31 , %ymm11, %ymm9 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm9 , %ymm9 + vpermilpd $ 0x05 , %ymm11, %ymm11 
+ + vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0 + vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1 + vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2 + vblendpd $ 0x05, %ymm11, %ymm10, %ymm3 + + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %ymm4, %ymm4 + vaddpd (%rax, LDC), %ymm5, %ymm5 + vaddpd (%rbp), %ymm6, %ymm6 + vaddpd (%rbp, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (%rax) + vmovups %ymm5 , (%rax, LDC) + vmovups %ymm6 , (%rbp) + vmovups %ymm7 , (%rbp, LDC) + + prefetcht0 56(%rax) + prefetcht0 56(%rax,LDC) + prefetcht0 56(%rbp) + prefetcht0 56(%rbp,LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ + +.macro INIT2x8 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + +.endm + +.macro KERNEL2x8_SUB + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -12 * SIZE(BO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm2 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -9 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -8 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vmovddup -7 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm1 , %xmm7 + vmovddup -6 * SIZE(BO), %xmm1 + vfmadd231pd %xmm0 ,%xmm2 , %xmm8 + vmovddup -5 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm3 , %xmm9 + vfmadd231pd %xmm0 ,%xmm1 , %xmm10 + vfmadd231pd %xmm0 ,%xmm2 , %xmm11 + addq $ 8*SIZE, BO + addq $ 2*SIZE, AO + +.endm + +.macro SAVE2x8 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + vmulpd %xmm0 , %xmm8 , %xmm8 + vmulpd %xmm0 , %xmm9 , %xmm9 + vmulpd %xmm0 , %xmm10, %xmm10 + vmulpd %xmm0 , %xmm11, %xmm11 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddpd (%rax), %xmm8 , %xmm4 + vaddpd (%rax, LDC), %xmm9 , %xmm5 + vaddpd (%rbp), %xmm10, %xmm6 + vaddpd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovups %xmm4 , (%rax) + vmovups %xmm5 , (%rax, LDC) + vmovups %xmm6 , (%rbp) + vmovups %xmm7 , (%rbp, LDC) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ + +.macro INIT1x8 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + +.endm + +.macro KERNEL1x8_SUB + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -11 * SIZE(BO), %xmm2 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -9 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -8 * SIZE(BO), %xmm2 + 
vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + vmovsd -7 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm1 , %xmm7 + vmovsd -6 * SIZE(BO), %xmm1 + vfmadd231sd %xmm0 ,%xmm2 , %xmm8 + vmovsd -5 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm3 , %xmm9 + vfmadd231sd %xmm0 ,%xmm1 , %xmm10 + vfmadd231sd %xmm0 ,%xmm2 , %xmm11 + addq $ 8*SIZE, BO + addq $ 1*SIZE, AO + +.endm + +.macro SAVE1x8 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + vmulsd %xmm0 , %xmm8 , %xmm8 + vmulsd %xmm0 , %xmm9 , %xmm9 + vmulsd %xmm0 , %xmm10, %xmm10 + vmulsd %xmm0 , %xmm11, %xmm11 + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + + leaq (%rax, LDC, 2), %rax + leaq (%rax, LDC, 2), %rbp + +#if !defined(TRMMKERNEL) + + vaddsd (%rax), %xmm8 , %xmm4 + vaddsd (%rax, LDC), %xmm9 , %xmm5 + vaddsd (%rbp), %xmm10, %xmm6 + vaddsd (%rbp, LDC), %xmm11, %xmm7 + +#endif + + vmovsd %xmm4 , (%rax) + vmovsd %xmm5 , (%rax, LDC) + vmovsd %xmm6 , (%rbp) + vmovsd %xmm7 , (%rbp, LDC) + + addq $ 1*SIZE, CO1 +.endm + + + + + +/******************************************************************************************/ + +.macro INIT4x4 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + +.endm + +.macro KERNEL4x4_I + prefetcht0 A_PR1(AO) + vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else + vmovups -16 * SIZE(AO), %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm6 + + addq $ 4*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vmulpd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + +.endm + +.macro KERNEL4x4_M1 + prefetcht0 A_PR1(AO) +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else + vmovups -16 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 +#if defined BROADCASTKERNEL + vbroadcastsd -13 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -12 * SIZE(BO), %ymm1 + +.endm + +.macro KERNEL4x4_M2 +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else + vmovups -12 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd 
$ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + vmovups -8 * SIZE(BO), %ymm1 + addq $ 8*SIZE, BO +.endm + + +.macro KERNEL4x4_E +#if defined BROADCASTKERNEL + vbroadcastsd -12 * SIZE(AO), %ymm0 +#else + vmovups -12 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -11 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 +#if defined BROADCASTKERNEL + vbroadcastsd -10 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + + addq $ 8*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + addq $ 4*SIZE, BO +.endm + +.macro KERNEL4x4_SUB + vmovups -12 * SIZE(BO), %ymm1 +#if defined BROADCASTKERNEL + vbroadcastsd -16 * SIZE(AO), %ymm0 +#else + vmovups -16 * SIZE(AO), %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm4 +#if defined BROADCASTKERNEL + vbroadcastsd -15 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm5 + addq $ 4*SIZE, BO +#if defined BROADCASTKERNEL + vbroadcastsd -14 * SIZE(AO), %ymm0 +#else + vpermpd $ 0x1b, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm6 + addq $ 4*SIZE, AO +#if defined BROADCASTKERNEL + vbroadcastsd -17 * SIZE(AO), %ymm0 +#else + vpermilpd $ 0x05, %ymm0 , %ymm0 +#endif + vfmadd231pd %ymm0 ,%ymm1 , %ymm7 + +.endm + +.macro SAVE4x4 + + vbroadcastsd ALPHA, %ymm0 + + vmulpd %ymm0 , %ymm4 , %ymm4 + vmulpd %ymm0 , %ymm7 , %ymm7 + vmulpd %ymm0 , %ymm5 , %ymm5 + vmulpd %ymm0 , %ymm6 , %ymm6 + +#if defined BROADCASTKERNEL + vperm2f128 $ 0x20 , %ymm6, %ymm4 , %ymm0 + vperm2f128 $ 0x20 , %ymm7, %ymm5 , %ymm1 + vperm2f128 $ 0x31 , %ymm6, %ymm4 , %ymm2 + vperm2f128 $ 0x31 , %ymm7, %ymm5 , %ymm3 + vunpcklpd %ymm1, %ymm0, %ymm4 + vunpckhpd %ymm1, %ymm0, %ymm5 + vunpcklpd %ymm3, %ymm2, %ymm6 + vunpckhpd %ymm3, %ymm2, %ymm7 +#else + vpermilpd $ 0x05 , %ymm5, %ymm5 + vpermilpd $ 0x05 , %ymm7, %ymm7 + + vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0 + vblendpd $ 0x05, %ymm5, %ymm4, %ymm1 + vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2 + vblendpd $ 0x05, %ymm7, %ymm6, %ymm3 + + vperm2f128 $ 0x01 , %ymm2, %ymm2 , %ymm2 + vperm2f128 $ 0x01 , %ymm3, %ymm3 , %ymm3 + + vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4 + vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5 + vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6 + vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7 +#endif + + leaq (CO1, LDC, 2), %rax + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %ymm4, %ymm4 + vaddpd (CO1, LDC), %ymm5, %ymm5 + vaddpd (%rax), %ymm6, %ymm6 + vaddpd (%rax, LDC), %ymm7, %ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , (CO1, LDC) + vmovups %ymm6 , (%rax) + vmovups %ymm7 , (%rax, LDC) + + addq $ 4*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x4 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL2x4_SUB + vmovddup -12 * SIZE(BO), %xmm1 + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -11 * SIZE(BO), %xmm2 + vfmadd231pd %xmm0 ,%xmm1 , %xmm4 + vmovddup -10 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm5 + vmovddup -9 * SIZE(BO), %xmm8 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + addq $ 4*SIZE, BO + vfmadd231pd %xmm0 ,%xmm8 , %xmm7 + addq $ 2*SIZE, AO + +.endm + + 
+.macro SAVE2x4 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + leaq (CO1, LDC, 2), %rax + +#if !defined(TRMMKERNEL) + + vaddpd (CO1), %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm5, %xmm5 + vaddpd (%rax), %xmm6, %xmm6 + vaddpd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (%rax) + vmovups %xmm7 , (%rax, LDC) + + addq $ 2*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x4 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL1x4_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vmovsd -10 * SIZE(BO), %xmm3 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + vmovsd -9 * SIZE(BO), %xmm8 + vfmadd231sd %xmm0 ,%xmm3 , %xmm6 + addq $ 4*SIZE, BO + vfmadd231sd %xmm0 ,%xmm8 , %xmm7 + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x4 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + vmulsd %xmm0 , %xmm6 , %xmm6 + vmulsd %xmm0 , %xmm7 , %xmm7 + + leaq (CO1, LDC, 2), %rax + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + vaddsd (%rax), %xmm6, %xmm6 + vaddsd (%rax, LDC), %xmm7, %xmm7 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (%rax) + vmovsd %xmm7 , (%rax, LDC) + + addq $ 1*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + +.endm + + +.macro KERNEL4x2_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovups -14 * SIZE(AO), %xmm1 + vmovddup -11 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm1 ,%xmm2 , %xmm5 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + vfmadd231pd %xmm1 ,%xmm3 , %xmm7 + addq $ 2*SIZE, BO + addq $ 4*SIZE, AO + +.endm + + +.macro SAVE4x2 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm5 , %xmm5 + vmulpd %xmm0 , %xmm6 , %xmm6 + vmulpd %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd 2 * SIZE(CO1) , %xmm5, %xmm5 + vaddpd (CO1, LDC), %xmm6, %xmm6 + vaddpd 2 * SIZE(CO1, LDC), %xmm7, %xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , 2 * SIZE(CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm7 , 2 * SIZE(CO1, LDC) + + addq $ 4*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm6 , %xmm6 , %xmm6 + +.endm + + +.macro KERNEL2x2_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -11 * SIZE(BO), %xmm3 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + vfmadd231pd %xmm0 ,%xmm3 , %xmm6 + addq $ 2*SIZE, BO + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x2 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + vmulpd %xmm0 , %xmm6 , 
%xmm6 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + vaddpd (CO1, LDC), %xmm6, %xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + + addq $ 2*SIZE, CO1 +.endm + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x2 + + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + +.endm + + +.macro KERNEL1x2_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vmovsd -11 * SIZE(BO), %xmm2 + vfmadd231sd %xmm0 ,%xmm1 , %xmm4 + vfmadd231sd %xmm0 ,%xmm2 , %xmm5 + addq $ 2*SIZE, BO + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x2 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + vmulsd %xmm0 , %xmm5 , %xmm5 + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + vaddsd (CO1, LDC), %xmm5, %xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + + addq $ 1*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT4x1 + + vxorpd %ymm4 , %ymm4 , %ymm4 + vxorpd %ymm5 , %ymm5 , %ymm5 + vxorpd %ymm6 , %ymm6 , %ymm6 + vxorpd %ymm7 , %ymm7 , %ymm7 + +.endm + + +.macro KERNEL4x1 + + vbroadcastsd -12 * SIZE(BO), %ymm0 + vbroadcastsd -11 * SIZE(BO), %ymm1 + vbroadcastsd -10 * SIZE(BO), %ymm2 + vbroadcastsd -9 * SIZE(BO), %ymm3 + + vfmadd231pd -16 * SIZE(AO) ,%ymm0 , %ymm4 + vfmadd231pd -12 * SIZE(AO) ,%ymm1 , %ymm5 + + vbroadcastsd -8 * SIZE(BO), %ymm0 + vbroadcastsd -7 * SIZE(BO), %ymm1 + + vfmadd231pd -8 * SIZE(AO) ,%ymm2 , %ymm6 + vfmadd231pd -4 * SIZE(AO) ,%ymm3 , %ymm7 + + vbroadcastsd -6 * SIZE(BO), %ymm2 + vbroadcastsd -5 * SIZE(BO), %ymm3 + + vfmadd231pd 0 * SIZE(AO) ,%ymm0 , %ymm4 + vfmadd231pd 4 * SIZE(AO) ,%ymm1 , %ymm5 + vfmadd231pd 8 * SIZE(AO) ,%ymm2 , %ymm6 + vfmadd231pd 12 * SIZE(AO) ,%ymm3 , %ymm7 + + addq $ 8 *SIZE, BO + addq $ 32*SIZE, AO + +.endm + + +.macro KERNEL4x1_SUB + vbroadcastsd -12 * SIZE(BO), %ymm2 + vmovups -16 * SIZE(AO), %ymm0 + vfmadd231pd %ymm0 ,%ymm2 , %ymm4 + addq $ 1*SIZE, BO + addq $ 4*SIZE, AO + +.endm + + +.macro SAVE4x1 + + vbroadcastsd ALPHA, %ymm0 + + vaddpd %ymm4,%ymm5, %ymm4 + vaddpd %ymm6,%ymm7, %ymm6 + vaddpd %ymm4,%ymm6, %ymm4 + + vmulpd %ymm0 , %ymm4 , %ymm4 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %ymm4, %ymm4 + +#endif + + vmovups %ymm4 , (CO1) + + addq $ 4*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT2x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + +.endm + + +.macro KERNEL2x1_SUB + vmovddup -12 * SIZE(BO), %xmm2 + vmovups -16 * SIZE(AO), %xmm0 + vfmadd231pd %xmm0 ,%xmm2 , %xmm4 + addq $ 1*SIZE, BO + addq $ 2*SIZE, AO + +.endm + + +.macro SAVE2x1 + + vmovddup ALPHA, %xmm0 + + vmulpd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddpd (CO1) , %xmm4, %xmm4 + +#endif + + vmovups %xmm4 , (CO1) + + addq $ 2*SIZE, CO1 +.endm + + +/******************************************************************************************/ +/******************************************************************************************/ + +.macro INIT1x1 + + vxorpd %xmm4 , %xmm4 , %xmm4 + +.endm + + +.macro KERNEL1x1_SUB + vmovsd -12 * SIZE(BO), %xmm1 + vmovsd -16 * SIZE(AO), %xmm0 + vfmadd231sd %xmm0 ,%xmm1 
, %xmm4 + addq $ 1*SIZE, BO + addq $ 1*SIZE, AO + +.endm + + +.macro SAVE1x1 + + vmovsd ALPHA, %xmm0 + + vmulsd %xmm0 , %xmm4 , %xmm4 + + +#if !defined(TRMMKERNEL) + + vaddsd (CO1), %xmm4, %xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + + addq $ 1*SIZE, CO1 +.endm + + +.macro PREFETCHT0_C + prefetcht0 (CO1) + prefetcht0 24(CO1) + prefetcht0 (CO1,LDC,4) + prefetcht0 24(CO1,LDC,4) + prefetcht0 (CO1,LDC,8) + prefetcht0 24(CO1,LDC,8) +.endm +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups %xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $24, %rdi + divq %rdi // N / 24 + movq %rax, Ndiv12 // N / 24 + movq %rdx, Nmod12 // N % 24 + + + movq Ndiv12, J + cmpq $ 0, J + je .L8_0 + ALIGN_4 + +.L12_01: + // copy to sub buffer + movq K, %rax + salq $3,%rax // K * 8 ; read 8 values from BO1 + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + movq BO2 , B + + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + + ALIGN_4 + +.L12_02b: + + vmovups 0 * SIZE(BO1), %ymm1 + vmovups 4 * SIZE(BO1), %ymm2 + vmovups 0 * SIZE(BO2), %ymm3 + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + decq %rax + jnz .L12_02b + +.L12_03c: + + +.L12_10: + movq C, CO1 + leaq (C, LDC, 8), C + leaq (C, LDC, 4), C // c += 12 * ldc + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L12_20 + + ALIGN_4 + +.L12_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + + jl .L12_13 + + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + subq $2, %rax + je .L12_12a + + ALIGN_5 +.L12_12: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + dec %rax + jne .L12_12 + +.L12_12a: + prefetcht0 ALPHA + PREFETCHT0_C + addq LDC,CO1 + KERNEL4x12_M1 + PREFETCHT0_C + leaq (CO1,LDC,2),CO1 + KERNEL4x12_M2 + PREFETCHT0_C + subq LDC,CO1 + KERNEL4x12_M1 + PREFETCHT0_C + subq LDC,CO1 + subq LDC,CO1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L12_16 + + +.L12_13: + + test $1, %rax + jz .L12_14 + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp 
.L12_16 + + +.L12_14: + + INIT4x12 + + +.L12_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_19 + + ALIGN_4 + +.L12_17: + + KERNEL4x12_SUB + + dec %rax + jne .L12_17 + ALIGN_4 + + +.L12_19: + + SAVE4x12 + + /* here for the prefetch of next b source block */ + /* the increment should be proportional to GEMM_Q/GEMM_P */ + + salq $3, K +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + prefetcht2 32(B) + prefetcht2 32(B, K, 8) + addq $64, B /* increment */ +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ + prefetcht2 32(B) + prefetcht2 32(B, K, 8) + prefetcht2 96(B) + prefetcht2 96(B, K, 8) + addq $128, B /* increment */ +#endif + sarq $3, K + + decq I # i -- + jne .L12_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ + + /* recover the original value of pointer B after prefetch */ + movq M, I + sarq $2, I +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + salq $6, I +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ + salq $7, I +#endif + subq I, B + +.L12_20: + // Test rest of M + + testq $3, M + jz .L12_100 // to next 16 lines of N + + +.L12_30: + testq $2, M + jz .L12_40 + + ALIGN_4 + +.L12_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x12 + + movq K, %rax + + sarq $3, %rax + je .L12_36 + ALIGN_4 + +.L12_32: + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + dec %rax + jne .L12_32 + ALIGN_4 + +.L12_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_39 + + ALIGN_4 + +.L12_37: + + KERNEL2x12_SUB + + dec %rax + jne .L12_37 + ALIGN_4 + + +.L12_39: + + SAVE2x12 + + ALIGN_4 + +.L12_40: + testq $1, M + jz .L12_100 // to next 3 lines of N + + ALIGN_4 + +.L12_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x12 + + movq K, %rax + + sarq $3,%rax + je .L12_46 + + ALIGN_4 + +.L12_42: + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + + dec %rax + jne .L12_42 + ALIGN_4 + +.L12_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L12_49 + + ALIGN_4 + +.L12_47: + + KERNEL1x12_SUB + + dec %rax + jne .L12_47 + ALIGN_4 + + +.L12_49: + + SAVE1x12 + + ALIGN_4 + +.L12_100: + + + +/**************************************************************************************************/ + +.L13_01: + // copy to sub buffer + movq K, %rax + salq $3,%rax // K * 8 ; read 8 values + movq B, BO2 + leaq (B,%rax, SIZE), BO3 // next offset to BO2 + leaq (BO3,%rax, SIZE), B // next offset to B + + + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + + ALIGN_4 + + +.L13_02b: + + vmovups 4 * SIZE(BO2), %ymm1 + vmovups 0 * SIZE(BO3), %ymm2 + vmovups 4 * SIZE(BO3), %ymm3 + vmovups %ymm1, 0 * SIZE(BO) + vmovups %ymm2, 4 * SIZE(BO) + vmovups %ymm3, 8 * SIZE(BO) + addq $ 8*SIZE,BO2 + addq $ 8*SIZE,BO3 + addq $ 12*SIZE,BO + decq %rax + jnz .L13_02b + + + +.L13_10: + movq C, CO1 + leaq (C, LDC, 8), C + leaq (C, LDC, 4), C // c += 12 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L13_20 + + ALIGN_4 + +.L13_11: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + + jl .L13_13 + + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + 
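/* Editor's sketch (not part of the patch): the comments around .L12_19 above
   ("here for the prefetch of next b source block", "the increment should be
   proportional to GEMM_Q/GEMM_P") describe a simple idea: while the M loop walks
   the current packed panels, each pass issues a few prefetcht2 touches into the
   next B panel and bumps B by a fixed increment, and once the M loop finishes the
   accumulated increment (I * 64 or I * 128 bytes) is subtracted back so B still
   addresses the current panel. A minimal, simplified C rendering; the names
   sketch_prefetch_next_b and B_INC are hypothetical. */

#include <xmmintrin.h>                    /* _mm_prefetch */

#define B_INC 128                         /* matches the addq $128, B (Linux) case */

static void sketch_prefetch_next_b(const double *b_next, long m_tiles)
{
    const char *b = (const char *)b_next;
    for (long i = 0; i < m_tiles; i++) {
        /* ... the 4x12 micro-kernel for row tile i runs here ... */
        _mm_prefetch(b + 32, _MM_HINT_T2);        /* prefetcht2 32(B) */
        _mm_prefetch(b + 96, _MM_HINT_T2);        /* prefetcht2 96(B) */
        b += B_INC;                               /* addq $128, B     */
    }
    /* the caller rewinds afterwards: b -= m_tiles * B_INC, which is the
       "recover the original value of pointer B" step above. */
}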
+ subq $2, %rax + je .L13_12a + + ALIGN_5 +.L13_12: + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + dec %rax + jne .L13_12 + +.L13_12a: + prefetcht0 ALPHA + PREFETCHT0_C + addq LDC,CO1 + KERNEL4x12_M1 + PREFETCHT0_C + leaq (CO1,LDC,2),CO1 + KERNEL4x12_M2 + PREFETCHT0_C + subq LDC,CO1 + KERNEL4x12_M1 + PREFETCHT0_C + subq LDC,CO1 + subq LDC,CO1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L13_16 + +.L13_13: + + test $1, %rax + jz .L13_14 + + KERNEL4x12_I + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_M2 + + KERNEL4x12_M1 + KERNEL4x12_M2 + KERNEL4x12_M1 + KERNEL4x12_E + + jmp .L13_16 + + +.L13_14: + + INIT4x12 + + +.L13_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L13_19 + + ALIGN_4 + +.L13_17: + + KERNEL4x12_SUB + + dec %rax + jne .L13_17 + ALIGN_4 + + +.L13_19: + + SAVE4x12 + + /* here for the prefetch of next b source block */ + /* the increment should be proportional to GEMM_Q/GEMM_P */ + + salq $3, K +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + prefetcht2 (B) + prefetcht2 (B, K, 8) + addq $64, B /* increment */ +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ + prefetcht2 (B) + prefetcht2 (B, K, 8) + prefetcht2 64(B) + prefetcht2 64(B, K, 8) + addq $128, B /* increment */ +#endif + sarq $3, K + + decq I # i -- + jne .L13_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ + /* recover the original value of pointer B */ + movq M, I + sarq $2, I +#ifdef WINDOWS_ABI /* GEMM_P == GEMM_Q * 4 */ + salq $6, I +#else /* GEMM_P == GEMM_Q * 2 under linux x86_64 */ + salq $7, I +#endif + subq I, B + +.L13_20: + // Test rest of M + + testq $3, M + jz .L13_100 // to next 16 lines of N + + +.L13_30: + testq $2, M + jz .L13_40 + + ALIGN_4 + +.L13_31: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x12 + + movq K, %rax + + sarq $3, %rax + je .L13_36 + ALIGN_4 + +.L13_32: + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + KERNEL2x12_SUB + + dec %rax + jne .L13_32 + ALIGN_4 + +.L13_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L13_39 + + ALIGN_4 + +.L13_37: + + KERNEL2x12_SUB + + dec %rax + jne .L13_37 + ALIGN_4 + + +.L13_39: + + SAVE2x12 + + ALIGN_4 + +.L13_40: + testq $1, M + jz .L13_100 // to next 3 lines of N + + ALIGN_4 + +.L13_41: + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x12 + + movq K, %rax + + sarq $3,%rax + je .L13_46 + + ALIGN_4 + +.L13_42: + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + KERNEL1x12_SUB + + + dec %rax + jne .L13_42 + ALIGN_4 + +.L13_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L13_49 + + ALIGN_4 + +.L13_47: + + KERNEL1x12_SUB + + dec %rax + jne .L13_47 + ALIGN_4 + + +.L13_49: + + SAVE1x12 + + ALIGN_4 + +.L13_100: + + decq J // j -- + jg .L12_01 + + + + +/**************************************************************************************************/ + +.L8_0: + + cmpq $ 0, Nmod12 // N % 12 == 0 + je .L999 + + movq Nmod12, J + sarq $3, J // j = j / 8 + je .L4_0 + +.L8_10: + movq C, CO1 + leaq (C, LDC, 8), C // c += 4 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L8_20 + + ALIGN_4 + +.L8_11: + movq B, BO + addq $12 * 
SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L8_13 + + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + subq $2, %rax + je .L8_12a + + ALIGN_5 + +.L8_12: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + dec %rax + jne .L8_12 + +.L8_12a: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_13: + + test $1, %rax + jz .L8_14 + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_14: + + INIT4x8 + + +.L8_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L8_19 + + ALIGN_4 + +.L8_17: + + KERNEL4x8_SUB + + dec %rax + jne .L8_17 + ALIGN_4 + + +.L8_19: + + SAVE4x8 + + decq I # i -- + jg .L8_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L8_20: + // Test rest of M + + testq $3, M + jz .L8_100 // to next 16 lines of N + + +.L8_30: + testq $2, M + jz .L8_40 + + ALIGN_4 + +.L8_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x8 + + movq K, %rax + + sarq $3, %rax + je .L8_36 + ALIGN_4 + +.L8_32: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + dec %rax + jne .L8_32 + ALIGN_4 + +.L8_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L8_39 + + ALIGN_4 + +.L8_37: + + KERNEL2x8_SUB + + dec %rax + jne .L8_37 + + +.L8_39: + + SAVE2x8 + +.L8_40: + testq $1, M + jz .L8_100 // to next 3 lines of N + + ALIGN_4 + +.L8_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x8 + + movq K, %rax + + sarq $3,%rax + je .L8_46 + + ALIGN_4 + +.L8_42: + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + dec %rax + jne .L8_42 + ALIGN_4 + +.L8_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L8_49 + + ALIGN_4 + +.L8_47: + + KERNEL1x8_SUB + + dec %rax + jne .L8_47 + ALIGN_4 + + +.L8_49: + + SAVE1x8 + + ALIGN_4 + +.L8_100: + + movq K, %rax + salq $3, %rax // * 8 + leaq (B , %rax, SIZE), B + decq J // j -- + jg .L8_10 + + + +/**************************************************************************************************/ + +.L4_0: + + cmpq $ 0, Nmod12 // N % 12 == 0 + je .L999 + + movq Nmod12, J + testq $4, J // j = j / 4 + je .L2_0 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + +.L4_11: + movq B, BO + addq $12 * SIZE, BO + + movq K, %rax + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + ALIGN_5 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + 
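/* Editor's sketch (not part of the patch): every 4xN block above (.L12_11,
   .L13_11, .L8_11, .L4_11 and their TRMM twins below) shares one
   software-pipelined loop shape: K is unrolled by 8, the first unrolled group is
   opened with the KERNEL4xN_I variant, the steady state alternates _M1/_M2, the
   last group is drained with _E, and the K % 8 leftovers run through the plain
   _SUB kernel (INIT4xN is used only when K < 8, i.e. when no _I ever ran). The
   control flow, rewritten as C with stand-in function pointers for the
   assembler macros: */

static void sketch_k_loop(long K,
                          void (*k_i)(void),  void (*k_m1)(void),
                          void (*k_m2)(void), void (*k_e)(void),
                          void (*k_sub)(void), void (*init)(void))
{
    long groups = K >> 3;                      /* sarq $3, %rax : K / 8 */
    if (groups >= 2) {
        k_i();                                 /* open the pipeline     */
        for (int j = 0; j < 3; j++) { k_m2(); k_m1(); }
        k_m2();
        for (long g = groups - 2; g > 0; g--)  /* steady state (.Lx_12) */
            for (int j = 0; j < 4; j++) { k_m1(); k_m2(); }
        for (int j = 0; j < 3; j++) { k_m1(); k_m2(); }
        k_m1();
        k_e();                                 /* drain (.Lx_12a)       */
    } else if (groups == 1) {                  /* .Lx_13 odd case       */
        k_i();
        for (int j = 0; j < 3; j++) { k_m2(); k_m1(); }
        k_e();
    } else {
        init();                                /* .Lx_14: INIT4xN       */
    }
    for (long t = K & 7; t > 0; t--)           /* andq $7, %rax : tail  */
        k_sub();
}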
KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + + decq I # i -- + jg .L4_11 + + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x4 + + movq K, %rax + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x4 + + movq K, %rax + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + + ALIGN_4 + +.L4_100: + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x2 + + movq K, %rax + sarq $3, %rax // K / 8 + + je .L2_16 + + ALIGN_5 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x2 + + movq K, %rax + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x2 + + movq K, %rax + + sarq 
$3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +.L2_100: + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + movq B, BO + addq $12 * SIZE, BO + + INIT4x1 + + movq K, %rax + + sarq $3, %rax // K / 8 + je .L1_16 + + ALIGN_5 + +.L1_12: + + KERNEL4x1 + + dec %rax + jne .L1_12 + + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT2x1 + + movq K, %rax + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + movq B, BO // first buffer to BO + addq $12 * SIZE, BO + + INIT1x1 + + movq K, %rax + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +.L1_100: + + + + +.L999: + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + vmovups %xmm6, 64(%rsp) + vmovups %xmm7, 80(%rsp) + vmovups 
%xmm8, 96(%rsp) + vmovups %xmm9, 112(%rsp) + vmovups %xmm10, 128(%rsp) + vmovups %xmm11, 144(%rsp) + vmovups %xmm12, 160(%rsp) + vmovups %xmm13, 176(%rsp) + vmovups %xmm14, 192(%rsp) + vmovups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovups %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + vmovsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $ 0, OLD_M + je .L999 + + cmpq $ 0, OLD_N + je .L999 + + cmpq $ 0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $8, %rdi + divq %rdi // N / 8 + movq %rax, Ndiv12 // N / 8 + movq %rdx, Nmod12 // N % 8 + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + +/*************************************************************************************************/ +.L8_0: + movq Ndiv12, J + cmpq $ 0, J + je .L4_0 + ALIGN_4 + +.L8_10: + movq C, CO1 + leaq (C, LDC, 8), C // c += 8 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L8_20 + + ALIGN_4 + +.L8_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,8), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $8, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L8_13 + + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + subq $2, %rax + je .L8_12a + + ALIGN_5 + +.L8_12: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + dec %rax + jne .L8_12 + +.L8_12a: + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_13: + + test $1, %rax + jz .L8_14 + + KERNEL4x8_I + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_M2 + + KERNEL4x8_M1 + KERNEL4x8_M2 + KERNEL4x8_M1 + KERNEL4x8_E + + jmp .L8_16 + + +.L8_14: + + INIT4x8 + + +.L8_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L8_19 + + ALIGN_4 + +.L8_17: + + KERNEL4x8_SUB + + dec %rax + jne .L8_17 + ALIGN_4 + + +.L8_19: + + SAVE4x8 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 8), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + 
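/* Editor's sketch (not part of the patch): the TRMM variant above repeats one
   bookkeeping pattern for every mr x nr tile size. From the running offset KK it
   derives where to start inside the packed A and B panels and how many
   k-iterations the tile really needs (KKK), depending on LEFT/TRANSA; after the
   SAVE it advances AO/BO past the unused K - KKK steps and, when LEFT is
   defined, grows KK by mr (the "addq $4, KK" just above). A compact C rendering
   of the #if/#elif ladder; trmm_tile_setup is a hypothetical helper name. */

typedef long blaslong_sketch;      /* stand-in for BLASLONG */

/* Returns KKK and moves *ao/*bo to the tile's starting position inside the
   packed panels, mirroring the preprocessor blocks in .L8_11/.L8_31/.L8_41. */
static blaslong_sketch trmm_tile_setup(blaslong_sketch K, blaslong_sketch KK,
                                       int left, int transa,
                                       blaslong_sketch mr, blaslong_sketch nr,
                                       const double **ao, const double **bo)
{
    if (!((left && transa) || (!left && !transa))) {
        *ao += KK * mr;            /* leaq (AO,%rax,mr), AO */
        *bo += KK * nr;            /* leaq (BO,%rax,nr), BO */
    }
    if ((left && !transa) || (!left && transa))
        return K - KK;             /* movq K,%rax; subq KK,%rax  */
    return KK + (left ? mr : nr);  /* movq KK,%rax; addq $mr/$nr */
}

/* After the tile is stored, the matching #if block above advances AO by
   (K - KKK) * mr and BO by (K - KKK) * nr elements, so both panel pointers end
   up just past this tile's data before the next tile starts. */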
+ decq I # i -- + jg .L8_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L8_20: + // Test rest of M + + testq $3, M + jz .L8_100 // to next 16 lines of N + + +.L8_30: + testq $2, M + jz .L8_40 + + ALIGN_4 + +.L8_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,8), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $8, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x8 + + sarq $3, %rax + je .L8_36 + ALIGN_4 + +.L8_32: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + dec %rax + jne .L8_32 + ALIGN_4 + +.L8_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L8_39 + + ALIGN_4 + +.L8_37: + + KERNEL2x8_SUB + + dec %rax + jne .L8_37 + + +.L8_39: + + SAVE2x8 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 8), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L8_40: + testq $1, M + jz .L8_100 // to next 3 lines of N + + ALIGN_4 + +.L8_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,8), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $8, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x8 + + sarq $3,%rax + je .L8_46 + + ALIGN_4 + +.L8_42: + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + dec %rax + jne .L8_42 + ALIGN_4 + +.L8_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L8_49 + + ALIGN_4 + +.L8_47: + + KERNEL1x8_SUB + + dec %rax + jne .L8_47 + ALIGN_4 + + +.L8_49: + + SAVE1x8 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 8), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + +.L8_100: + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $8, KK // number 
of values in B +#endif + + + decq J // j -- + jg .L8_10 + + + + + +/*************************************************************************************************/ +.L4_0: + movq Nmod12, J + testq $4, J + je .L2_0 + ALIGN_4 + +.L4_10: + movq C, CO1 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L4_20 + + ALIGN_4 + +.L4_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + sarq $3, %rax // K / 8 + cmpq $2, %rax + jl .L4_13 + + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + subq $2, %rax + je .L4_12a + + ALIGN_5 + +.L4_12: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + dec %rax + jne .L4_12 + +.L4_12a: + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_13: + + test $1, %rax + jz .L4_14 + + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_E + + jmp .L4_16 + + +.L4_14: + + INIT4x4 + + +.L4_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_19 + + ALIGN_4 + +.L4_17: + + KERNEL4x4_SUB + + dec %rax + jne .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $3, M + jz .L4_100 // to next 16 lines of N + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, 
%rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x4 + + sarq $3, %rax + je .L4_36 + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + dec %rax + jne .L4_32 + ALIGN_4 + +.L4_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_39 + + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + dec %rax + jne .L4_37 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L4_40: + testq $1, M + jz .L4_100 // to next 3 lines of N + + ALIGN_4 + +.L4_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,4), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x4 + + sarq $3,%rax + je .L4_46 + + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + dec %rax + jne .L4_42 + ALIGN_4 + +.L4_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L4_49 + + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + dec %rax + jne .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 4), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + +.L4_100: + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK // number of values in B +#endif + + + movq K, %rax + salq $2, %rax // * 4 + leaq (B , %rax, SIZE), B + + + + +/***************************************************************************************************************/ + +.L2_0: + + movq Nmod12, J + testq $2, J + je .L1_0 + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x2 + + sarq $3, %rax // K / 8 + + je .L2_16 + + ALIGN_5 + +.L2_12: + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + dec %rax + jne .L2_12 + + +.L2_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + ALIGN_4 + +.L2_17: + + KERNEL4x2_SUB + + dec %rax + jne .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $3, M + jz .L2_100 // to next 16 lines of N + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x2 + + sarq $3, %rax + je .L2_36 + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + dec %rax + jne .L2_32 + +.L2_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + dec %rax + jne .L2_37 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax + SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L2_40: + testq $1, M + jz .L2_100 // to next 3 lines of N + +.L2_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,2), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq 
%rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x2 + + sarq $3,%rax + je .L2_46 + + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + dec %rax + jne .L2_42 + +.L2_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + dec %rax + jne .L2_47 + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 2), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + +.L2_100: + + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK // number of values in B +#endif + + movq K, %rax + salq $1, %rax // * 2 + leaq (B , %rax, SIZE), B + +/***************************************************************************************************************/ + +.L1_0: + + movq Nmod12, J + testq $1, J + je .L999 + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $2, I // i = m / 4 + je .L1_20 + + ALIGN_4 + +.L1_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,4), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT4x1 + + sarq $3, %rax // K / 8 + je .L1_16 + + ALIGN_5 + +.L1_12: + + KERNEL4x1 + + dec %rax + jne .L1_12 + + +.L1_16: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + ALIGN_4 + +.L1_17: + + KERNEL4x1_SUB + + dec %rax + jne .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 4), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK // number of values in A +#endif + + + decq I # i -- + jg .L1_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $3, M + jz .L1_100 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + 
salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,2), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT2x1 + + sarq $3, %rax + je .L1_36 + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + + dec %rax + jne .L1_32 + +.L1_36: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + dec %rax + jne .L1_37 + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 2), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK // number of values in A +#endif + + +.L1_40: + testq $1, M + jz .L1_100 // to next 3 lines of N + + +.L1_41: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO + addq $12 * SIZE, BO +#else + movq B, BO + addq $12 * SIZE, BO + movq KK, %rax + salq $3, %rax // rax * SIZE + leaq (BO,%rax,1), BO // add number of values in B + leaq (AO,%rax,1), AO // add number of values in A +#endif + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + INIT1x1 + + sarq $3,%rax + je .L1_46 + + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + dec %rax + jne .L1_42 + +.L1_46: + movq KKK, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + dec %rax + jne .L1_47 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $3, %rax // rax * SIZE + leaq (BO, %rax, 1), BO // number of values in B + leaq (AO, %rax, 1), AO // number of values in A +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK // number of values in A +#endif + + + +.L1_100: + + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK // number of values in B +#endif + + + +.L999: + + vzeroupper + + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + vmovups 64(%rsp), %xmm6 + vmovups 80(%rsp), %xmm7 + vmovups 96(%rsp), %xmm8 + vmovups 112(%rsp), %xmm9 + vmovups 128(%rsp), %xmm10 + vmovups 144(%rsp), %xmm11 + vmovups 160(%rsp), %xmm12 + vmovups 176(%rsp), %xmm13 + vmovups 192(%rsp), %xmm14 + vmovups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git 
a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c index 90a4c2b1d..a5daffb94 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c +++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c @@ -1,670 +1,670 @@ -#include "common.h" -#include -#include - -//register usage: zmm3 for alpha, zmm0-zmm2 and zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators. - -/* row-major c_block */ -#define INNER_KERNEL_k1m1n8 \ - "prefetcht0 384(%1);"\ - "vmovupd (%1),%%zmm5; addq $64,%1;"\ - "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8;" - -#define INNER_KERNEL_k1m2n8 \ - INNER_KERNEL_k1m1n8\ - "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;" - -#define INNER_KERNEL_k1m1n16 \ - "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2);"\ - "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; addq $64,%1;"\ - "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9;" - -#define INNER_KERNEL_k1m2n16 \ - INNER_KERNEL_k1m1n16\ - "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;" - -#define INNER_KERNEL_k1m1n24 \ - "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2); prefetcht0 128(%1,%%r12,4);"\ - "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; vmovupd (%1,%%r12,4),%%zmm7; addq $64,%1;"\ - "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9; vfmadd231pd %%zmm7,%%zmm4,%%zmm10;" - -#define INNER_KERNEL_k1m2n24 \ - INNER_KERNEL_k1m1n24\ - "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;" - -/* row-major z-partition c_block */ -#define INNER_KERNEL_k1m4n8 \ - "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5; addq $32,%0;"\ - "vmovddup (%1),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm8; vfmadd231pd %%zmm5,%%zmm6,%%zmm10;"\ - "vmovddup 8(%1),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm9; vfmadd231pd %%zmm5,%%zmm7,%%zmm11;" - -#define INNER_KERNEL_k1m4n16 \ - INNER_KERNEL_k1m4n8\ - "vmovddup (%1,%%r12,2),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm12; vfmadd231pd %%zmm5,%%zmm6,%%zmm14;"\ - "vmovddup 8(%1,%%r12,2),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm13; vfmadd231pd %%zmm5,%%zmm7,%%zmm15;" - -#define INNER_KERNEL_k1m4n24 \ - INNER_KERNEL_k1m4n16\ - "vmovddup (%1,%%r12,4),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm16; vfmadd231pd %%zmm5,%%zmm6,%%zmm18;"\ - "vmovddup 8(%1,%%r12,4),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm17; vfmadd231pd %%zmm5,%%zmm7,%%zmm19;" - -#define INNER_KERNEL_k1m8n8 \ - "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5;"\ - "vbroadcastf32x4 (%0,%%r12,1),%%zmm6; vbroadcastf32x4 16(%0,%%r12,1),%%zmm7; addq $32,%0;"\ - "prefetcht0 128(%1);"\ - "vmovddup (%1),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm8; vfmadd231pd %%zmm5,%%zmm2,%%zmm10;"\ - "vfmadd231pd %%zmm6,%%zmm2,%%zmm12; vfmadd231pd %%zmm7,%%zmm2,%%zmm14;"\ - "vmovddup 8(%1),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm9; vfmadd231pd %%zmm5,%%zmm1,%%zmm11;"\ - "vfmadd231pd %%zmm6,%%zmm1,%%zmm13; vfmadd231pd %%zmm7,%%zmm1,%%zmm15;" - -#define INNER_KERNEL_k1m8n16 \ - INNER_KERNEL_k1m8n8\ - "prefetcht0 128(%1,%%r12,2);"\ - "vmovddup (%1,%%r12,2),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm16; vfmadd231pd %%zmm5,%%zmm2,%%zmm18;"\ - "vfmadd231pd %%zmm6,%%zmm2,%%zmm20; vfmadd231pd %%zmm7,%%zmm2,%%zmm22;"\ - "vmovddup 8(%1,%%r12,2),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm17; vfmadd231pd %%zmm5,%%zmm1,%%zmm19;"\ - "vfmadd231pd %%zmm6,%%zmm1,%%zmm21; vfmadd231pd %%zmm7,%%zmm1,%%zmm23;" - -#define 
INNER_KERNEL_k1m8n24 \ - INNER_KERNEL_k1m8n16\ - "prefetcht0 128(%1,%%r12,4);"\ - "vmovddup (%1,%%r12,4),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm24; vfmadd231pd %%zmm5,%%zmm2,%%zmm26;"\ - "vfmadd231pd %%zmm6,%%zmm2,%%zmm28; vfmadd231pd %%zmm7,%%zmm2,%%zmm30;"\ - "vmovddup 8(%1,%%r12,4),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm25; vfmadd231pd %%zmm5,%%zmm1,%%zmm27;"\ - "vfmadd231pd %%zmm6,%%zmm1,%%zmm29; vfmadd231pd %%zmm7,%%zmm1,%%zmm31;" - -/* micro kernels */ -#define INNER_KERNELm1(nn) \ - "cmpq $1,%2;jb "#nn"3f;"\ - #nn"4:\n\t"\ - INNER_KERNEL_k1m1n##nn "addq $8,%0;"\ - "decq %2;cmpq $1,%2;jnb "#nn"4b;"\ - #nn"3:\n\t" - -#define INNER_KERNELm2(nn) \ - "cmpq $1,%2;jb "#nn"0f;"\ - #nn"1:\n\t"\ - INNER_KERNEL_k1m2n##nn "addq $16,%0;"\ - "decq %2;cmpq $1,%2;jnb "#nn"1b;"\ - #nn"0:\n\t" - -#define INNER_KERNELm4(nn) \ - "cmpq $1,%2;jb "#nn"00f;"\ - #nn"01:\n\t"\ - INNER_KERNEL_k1m4n##nn "addq $64,%1;"\ - "decq %2;cmpq $1,%2;jnb "#nn"01b;"\ - #nn"00:\n\t" - -/* %10 for prefetch of C elements before storage; %4 = ldc(in bytes),%11 for prefetch of next B block */ -#define INNER_KERNELm8(nn) \ - "movq %3,%10;cmpq $18,%2;jb "#nn"001f;"\ - #nn"008:\n\t"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - "prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - "prefetcht1 (%11); addq $32,%11;"\ - "subq $6,%2;cmpq $18,%2;jnb "#nn"008b;"\ - "movq %3,%10;"\ - #nn"001:\n\t"\ - "cmpq $1,%2;jb "#nn"000f;"\ - "prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\ - INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ - "decq %2;jmp "#nn"001b;"\ - ""#nn"000:\n\t" - -#define INNER_INIT_m1n8 \ - "vpxorq %%zmm8, %%zmm8, %%zmm8;" - -#define INNER_INIT_m2n8 \ - "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9;" - -#define INNER_INIT_m4n8 \ - "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;" - -#define INNER_INIT_m8n8 \ - INNER_INIT_m4n8\ - "vpxorq %%zmm12,%%zmm12,%%zmm12;vpxorq %%zmm13,%%zmm13,%%zmm13;vpxorq %%zmm14,%%zmm14,%%zmm14;vpxorq %%zmm15,%%zmm15,%%zmm15;" - -#define INNER_INIT_m1n16 INNER_INIT_m2n8 - -#define INNER_INIT_m2n16 INNER_INIT_m4n8 - -#define INNER_INIT_m4n16 INNER_INIT_m8n8 - -#define INNER_INIT_m8n16 \ - INNER_INIT_m8n8\ - "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;"\ - "vpxorq %%zmm20,%%zmm20,%%zmm20;vpxorq %%zmm21,%%zmm21,%%zmm21;vpxorq %%zmm22,%%zmm22,%%zmm22;vpxorq %%zmm23,%%zmm23,%%zmm23;" - -#define INNER_INIT_m1n24 \ - "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;" - -#define INNER_INIT_m2n24 \ - INNER_INIT_m1n24\ - "vpxorq %%zmm11,%%zmm11,%%zmm11; vpxorq %%zmm12,%%zmm12,%%zmm12; vpxorq %%zmm13,%%zmm13,%%zmm13;" - -#define INNER_INIT_m4n24 \ - INNER_INIT_m4n16\ - "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;" - -#define INNER_INIT_m8n24 \ - INNER_INIT_m8n16\ - "vpxorq %%zmm24,%%zmm24,%%zmm24;vpxorq %%zmm25,%%zmm25,%%zmm25;vpxorq %%zmm26,%%zmm26,%%zmm26;vpxorq %%zmm27,%%zmm27,%%zmm27;"\ - "vpxorq %%zmm28,%%zmm28,%%zmm28;vpxorq %%zmm29,%%zmm29,%%zmm29;vpxorq %%zmm30,%%zmm30,%%zmm30;vpxorq %%zmm31,%%zmm31,%%zmm31;" - -#define INNER_SETINDEX \ - "vpinsrq 
$0,%4,%%xmm4,%%xmm4; vbroadcastsd %%xmm4,%%zmm4;"\ - "kxnorw %%k1,%%k1,%%k1; kshiftlw $1,%%k1,%%k1; vpxorq %%zmm6,%%zmm6,%%zmm6; vmovapd %%zmm4,%%zmm6%{%%k1%};"\ - "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ - "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ - "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ - "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ - "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ - "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};" - -#define INNER_STORE_m1n8(c1,disp) \ - "kxnorw %%k1,%%k1,%%k1;"\ - "vgatherqpd "#disp"(%10,%%zmm6,1), %%zmm7 %{%%k1%};"\ - "vfmadd132pd %%zmm3,%%zmm7,"#c1";"\ - "kxnorw %%k1,%%k1,%%k1;"\ - "vscatterqpd "#c1", "#disp"(%10,%%zmm6,1) %{%%k1%};" - -#define INNER_SAVE_m1n8 \ - "movq %3,%10;"\ - INNER_SETINDEX\ - INNER_STORE_m1n8(%%zmm8,0) - -#define INNER_SAVE_m1n16 \ - INNER_SAVE_m1n8\ - "leaq (%10,%4,8),%10;"\ - INNER_STORE_m1n8(%%zmm9,0) - -#define INNER_SAVE_m1n24 \ - INNER_SAVE_m1n16\ - "leaq (%10,%4,8),%10;"\ - INNER_STORE_m1n8(%%zmm10,0) - -#define INNER_SAVE_m2n8 \ - "movq %3,%10;"\ - INNER_SETINDEX\ - INNER_STORE_m1n8(%%zmm8,0)\ - INNER_STORE_m1n8(%%zmm9,8) - -#define INNER_SAVE_m2n16 \ - "movq %3,%10;"\ - INNER_SETINDEX\ - INNER_STORE_m1n8(%%zmm8,0)\ - INNER_STORE_m1n8(%%zmm10,8)\ - "leaq (%10,%4,8),%10;"\ - INNER_STORE_m1n8(%%zmm9,0)\ - INNER_STORE_m1n8(%%zmm11,8) - -#define INNER_SAVE_m2n24 \ - "movq %3,%10;"\ - INNER_SETINDEX\ - INNER_STORE_m1n8(%%zmm8,0)\ - INNER_STORE_m1n8(%%zmm11,8)\ - "leaq (%10,%4,8),%10;"\ - INNER_STORE_m1n8(%%zmm9,0)\ - INNER_STORE_m1n8(%%zmm12,8)\ - "leaq (%10,%4,8),%10;"\ - INNER_STORE_m1n8(%%zmm10,0)\ - INNER_STORE_m1n8(%%zmm13,8) - -#define INNER_TRANS_4x8(c1,c2,c3,c4) \ - "vblendmpd "#c3","#c1",%%zmm4%{%6%}; vblendmpd "#c4","#c2",%%zmm6%{%6%};"\ - "vshuff64x2 $177,%%zmm4,%%zmm4,%%zmm4; vshuff64x2 $177,%%zmm6,%%zmm6,%%zmm6;"\ - "vblendmpd "#c1",%%zmm4,"#c1"%{%6%}; vblendmpd "#c2",%%zmm6,"#c2"%{%6%};"\ - "vblendmpd %%zmm4,"#c3","#c3"%{%6%}; vblendmpd %%zmm6,"#c4","#c4"%{%6%};"\ - -#define INNER_TRANS_f128_4x4(c1,c2,c3,c4) \ - "vshuff64x2 $68,"#c3","#c1",%%zmm4; vshuff64x2 $17,"#c4","#c2",%%zmm5;"\ - "vshuff64x2 $238,"#c3","#c1",%%zmm6; vshuff64x2 $187,"#c4","#c2",%%zmm7;"\ - "vblendmpd %%zmm5,%%zmm4,"#c2"%{%6%}; vshuff64x2 $177,"#c2","#c2","#c2"; vblendmpd %%zmm4,%%zmm5,"#c1"%{%6%};"\ - "vblendmpd %%zmm7,%%zmm6,"#c4"%{%6%}; vshuff64x2 $177,"#c4","#c4","#c4"; vblendmpd %%zmm6,%%zmm7,"#c3"%{%6%};" - -#define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ - INNER_TRANS_f128_4x4(c1,c3,c5,c7) INNER_TRANS_f128_4x4(c2,c4,c6,c8) - -//%7 for k01(input) only when m=4 -#define INNER_STORE_4x8(c1,c2,c3,c4) \ - "vmovupd (%10),%%zmm4%{%5%};vmovupd -32(%10,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\ - "vmovupd "#c1",(%10)%{%5%}; vmovupd "#c1",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ - "vmovupd (%10),%%zmm5%{%5%};vmovupd -32(%10,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\ - "vmovupd "#c2",(%10)%{%5%}; vmovupd "#c2",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ - "vmovupd (%10),%%zmm6%{%5%};vmovupd -32(%10,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\ - "vmovupd "#c3",(%10)%{%5%}; vmovupd "#c3",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ - "vmovupd (%10),%%zmm7%{%5%};vmovupd -32(%10,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\ - "vmovupd "#c4",(%10)%{%5%}; vmovupd "#c4",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ - "leaq (%10,%4,4),%10;" - -#define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ - 
"vfmadd213pd (%10),%%zmm3,"#c1"; vmovupd "#c1",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%10,%4,1); leaq (%10,%4,2),%10;"\ - "vfmadd213pd (%10),%%zmm3,"#c3"; vmovupd "#c3",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%10,%4,1); leaq (%10,%4,2),%10;"\ - "vfmadd213pd (%10),%%zmm3,"#c5"; vmovupd "#c5",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%10,%4,1); leaq (%10,%4,2),%10;"\ - "vfmadd213pd (%10),%%zmm3,"#c7"; vmovupd "#c7",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%10,%4,1); leaq (%10,%4,2),%10;" - -#define INNER_SAVE_m4n8 \ - "movq %3,%10;"\ - INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\ - INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11) - -#define INNER_SAVE_m4n16 \ - INNER_SAVE_m4n8\ - INNER_TRANS_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ - INNER_STORE_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15) - -#define INNER_SAVE_m4n24 \ - INNER_SAVE_m4n16\ - INNER_TRANS_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)\ - INNER_STORE_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19) - -#define INNER_SAVE_m8n8 \ - "movq %3,%10;"\ - INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ - INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15) - -#define INNER_SAVE_m8n16 \ - INNER_SAVE_m8n8\ - INNER_TRANS_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)\ - INNER_STORE_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23) - -#define INNER_SAVE_m8n24 \ - INNER_SAVE_m8n16\ - INNER_TRANS_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)\ - INNER_STORE_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31) - -#define COMPUTE_n8 {\ - b_pref = packed_b_pointer + 8 * K;\ - __asm__ __volatile__(\ - "vbroadcastsd (%9),%%zmm3;"\ - "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ - "cmpq $8,%8; jb 42222f;"\ - "42221:\n\t"\ - INNER_INIT_m8n8\ - INNER_KERNELm8(8)\ - INNER_SAVE_m8n8\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ - "addq $64,%3;"\ - "subq $8,%8; cmpq $8,%8; jnb 42221b;"\ - "42222:\n\t"\ - "cmpq $4,%8; jb 42223f;"\ - INNER_INIT_m4n8\ - INNER_KERNELm4(8)\ - INNER_SAVE_m4n8\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $32,%3;"\ - "subq $4,%8;"\ - "42223:\n\t"\ - "cmpq $2,%8; jb 42224f;"\ - INNER_INIT_m2n8\ - INNER_KERNELm2(8)\ - INNER_SAVE_m2n8\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $16,%3;"\ - "subq $2,%8;"\ - "42224:\n\t"\ - "cmpq $1,%8; jb 42225f;"\ - INNER_INIT_m1n8\ - INNER_KERNELm1(8)\ - INNER_SAVE_m1n8\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $8,%3;"\ - "42225:\n\t"\ - "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ - "shlq $3,%4;addq %4,%3;shrq $3,%4;"\ - :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ - "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ - ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\ - a_block_pointer -= M * K;\ -} -#define COMPUTE_n16 {\ - b_pref = packed_b_pointer + 16 * K;\ - __asm__ __volatile__(\ - "vbroadcastsd (%9),%%zmm3;"\ - "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ - "cmpq $8,%8; jb 32222f;"\ - "32221:\n\t"\ - INNER_INIT_m8n16\ - INNER_KERNELm8(16)\ - INNER_SAVE_m8n16\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ - "addq $64,%3;"\ - "subq $8,%8; cmpq $8,%8; jnb 32221b;"\ - "32222:\n\t"\ - "cmpq $4,%8; jb 32223f;"\ - 
INNER_INIT_m4n16\ - INNER_KERNELm4(16)\ - INNER_SAVE_m4n16\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $32,%3;"\ - "subq $4,%8;"\ - "32223:\n\t"\ - "cmpq $2,%8; jb 32224f;"\ - INNER_INIT_m2n16\ - INNER_KERNELm2(16)\ - INNER_SAVE_m2n16\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $16,%3;"\ - "subq $2,%8;"\ - "32224:\n\t"\ - "cmpq $1,%8; jb 32225f;"\ - INNER_INIT_m1n16\ - INNER_KERNELm1(16)\ - INNER_SAVE_m1n16\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $8,%3;"\ - "32225:\n\t"\ - "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ - "shlq $4,%4;addq %4,%3;shrq $4,%4;"\ - "leaq (%1,%%r12,4),%1;"\ - :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ - "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ - ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ - "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\ - a_block_pointer -= M * K;\ -} -#define COMPUTE_n24 {\ - b_pref = packed_b_pointer + 24 * K;\ - __asm__ __volatile__(\ - "vbroadcastsd (%9),%%zmm3;"\ - "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ - "cmpq $8,%8; jb 22222f;"\ - "22221:\n\t"\ - INNER_INIT_m8n24\ - INNER_KERNELm8(24)\ - INNER_SAVE_m8n24\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ - "addq $64,%3;"\ - "subq $8,%8; cmpq $8,%8; jnb 22221b;"\ - "22222:\n\t"\ - "cmpq $4,%8; jb 22223f;"\ - INNER_INIT_m4n24\ - INNER_KERNELm4(24)\ - INNER_SAVE_m4n24\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $32,%3;"\ - "subq $4,%8;"\ - "22223:\n\t"\ - "cmpq $2,%8; jb 22224f;"\ - INNER_INIT_m2n24\ - INNER_KERNELm2(24)\ - INNER_SAVE_m2n24\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $16,%3;"\ - "subq $2,%8;"\ - "22224:\n\t"\ - "cmpq $1,%8; jb 22225f;"\ - INNER_INIT_m1n24\ - INNER_KERNELm1(24)\ - INNER_SAVE_m1n24\ - "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ - "addq $8,%3;"\ - "22225:\n\t"\ - "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ - "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\ - "leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\ - :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ - "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)::\ - "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18",\ - "zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\ - a_block_pointer -= M * K;\ -} -static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=4,ocopy=8 -//perform C += A B - if(k==0 || m==0 || ndiv8==0) return; - int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double); - int64_t K = (int64_t)k; int64_t M = (int64_t)m; - double *a_block_pointer,*b_pref; - double *c_pointer = c,*c_store = c; - __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033; - BLASLONG ndiv8_count; - double *packed_b_pointer = packed_b; - a_block_pointer = packed_a; - for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){ - COMPUTE_n24 - } - for(;ndiv8_count>1;ndiv8_count-=2){ - COMPUTE_n16 - } - if(ndiv8_count>0){ - COMPUTE_n8 - } -} - -/* __m256d accumulators: yc1-yc4; temporary variables: ya1,yb1-yb2 */ -/* __m128d accumulators: xc1-xc2; 
temporary variables: xa1,xb1-xb2 */ -/* double accumulator: sc1; temporary variables: sa1,sb1 */ -/* column-major c_block */ -#define KERNEL_m4n4k1 {\ - ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ - yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ - yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\ - yb1 = _mm256_broadcast_sd(b_block_pointer+2); yc3 = _mm256_fmadd_pd(ya1,yb1,yc3);\ - yb2 = _mm256_broadcast_sd(b_block_pointer+3); yc4 = _mm256_fmadd_pd(ya1,yb2,yc4);\ - b_block_pointer+=4;\ -} -#define KERNEL_m4n2k1 {\ - ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ - yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ - yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\ - b_block_pointer+=2;\ -} -#define KERNEL_m4n1k1 {\ - ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ - yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ - b_block_pointer++;\ -} -#define INIT_m4n1 yc1=_mm256_setzero_pd(); -#define INIT_m4n2 yc2=INIT_m4n1 -#define INIT_m4n4 yc4=yc3=INIT_m4n2 -#define SAVE_m4n1 {\ - yb1 = _mm256_broadcast_sd(alpha);\ - ya1 = _mm256_loadu_pd(c_pointer);\ - yc1 = _mm256_fmadd_pd(yc1,yb1,ya1);\ - _mm256_storeu_pd(c_pointer,yc1);\ - c_pointer += 4;\ -} -#define SAVE_m4n2 {\ - ya1 = _mm256_broadcast_sd(alpha);\ - yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ - yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\ - _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ - c_pointer += 4;\ -} -#define SAVE_m4n4 {\ - ya1 = _mm256_broadcast_sd(alpha);\ - yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ - yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\ - _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ - c_pointer += LDC*2;\ - yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ - yc3 = _mm256_fmadd_pd(yc3,ya1,yb1); yc4 = _mm256_fmadd_pd(yc4,ya1,yb2);\ - _mm256_storeu_pd(c_pointer,yc3); _mm256_storeu_pd(c_pointer+LDC,yc4);\ - c_pointer += 4-LDC*2;\ -} -#define KERNEL_m2n2k1 {\ - xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\ - xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ - xb2 = _mm_loaddup_pd(b_block_pointer+1); xc2 = _mm_fmadd_pd(xa1,xb2,xc2);\ - b_block_pointer += 2;\ -} -#define KERNEL_m2n1k1 {\ - xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\ - xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ - b_block_pointer ++;\ -} -#define INIT_m2n1 xc1=_mm_setzero_pd(); -#define INIT_m2n2 xc2=INIT_m2n1 -#define SAVE_m2n1 {\ - xb1 = _mm_loaddup_pd(alpha);\ - xa1 = _mm_loadu_pd(c_pointer);\ - xc1 = _mm_fmadd_pd(xc1,xb1,xa1);\ - _mm_storeu_pd(c_pointer,xc1);\ - c_pointer += 2;\ -} -#define SAVE_m2n2 {\ - xa1 = _mm_loaddup_pd(alpha);\ - xb1 = _mm_loadu_pd(c_pointer); xb2 = _mm_loadu_pd(c_pointer+LDC);\ - xc1 = _mm_fmadd_pd(xc1,xa1,xb1); xc2 = _mm_fmadd_pd(xc2,xa1,xb2);\ - _mm_storeu_pd(c_pointer,xc1); _mm_storeu_pd(c_pointer+LDC,xc2);\ - c_pointer += 2;\ -} -#define KERNEL_m1n1k1 {\ - sa1 = *a_block_pointer; a_block_pointer++;\ - sb1 = *b_block_pointer; sc1 += sa1 * sb1;\ - b_block_pointer ++;\ -} -#define INIT_m1n1 sc1=0.0; -#define SAVE_m1n1 {\ - *c_pointer += sc1 * (*alpha);\ - c_pointer++;\ -} -/* row-major c_block */ -#define KERNEL_m2n4k1 {\ - yb1 = 
_mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\ - ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ - ya1 = _mm256_broadcast_sd(a_block_pointer+1);yc2 = _mm256_fmadd_pd(ya1,yb1,yc2);\ - a_block_pointer += 2;\ -} -#define KERNEL_m1n4k1 {\ - yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\ - ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ - a_block_pointer ++;\ -} -#define KERNEL_m1n2k1 {\ - xb1 = _mm_loadu_pd(b_block_pointer);b_block_pointer+=2;\ - xa1 = _mm_loaddup_pd(a_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ - a_block_pointer ++;\ -} -#define INIT_m1n2 INIT_m2n1 -#define INIT_m1n4 INIT_m4n1 -#define INIT_m2n4 INIT_m4n2 -#define SAVE_m2n4 {\ - ya1 = _mm256_broadcast_sd(alpha);\ - yc1 = _mm256_mul_pd(yc1,ya1);\ - yc2 = _mm256_mul_pd(yc2,ya1);\ - yb1 = _mm256_unpacklo_pd(yc1,yc2);\ - yb2 = _mm256_unpackhi_pd(yc1,yc2);\ - xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer),_mm256_extractf128_pd(yb1,0));\ - xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+LDC),_mm256_extractf128_pd(yb2,0));\ - _mm_storeu_pd(c_pointer,xb1);\ - _mm_storeu_pd(c_pointer+LDC,xb2);\ - xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer+2*LDC),_mm256_extractf128_pd(yb1,1));\ - xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+3*LDC),_mm256_extractf128_pd(yb2,1));\ - _mm_storeu_pd(c_pointer+2*LDC,xb1);\ - _mm_storeu_pd(c_pointer+3*LDC,xb2);\ - c_pointer += 2;\ -} -#define SAVE_m1n2 {\ - xb1 = _mm_loaddup_pd(alpha);\ - xc1 = _mm_mul_pd(xc1,xb1);\ - *c_pointer += _mm_cvtsd_f64(xc1);\ - xa1 = _mm_unpackhi_pd(xc1,xc1);\ - c_pointer[LDC]+= _mm_cvtsd_f64(xa1);\ - c_pointer ++;\ -} -#define SAVE_m1n4 {\ - ya1 = _mm256_broadcast_sd(alpha);\ - yc1 = _mm256_mul_pd(yc1,ya1);\ - xb1 = _mm256_extractf128_pd(yc1,0);\ - *c_pointer += _mm_cvtsd_f64(xb1);\ - xb2 = _mm_unpackhi_pd(xb1,xb1);\ - c_pointer[LDC] += _mm_cvtsd_f64(xb2);\ - xb1 = _mm256_extractf128_pd(yc1,1);\ - c_pointer[LDC*2] += _mm_cvtsd_f64(xb1);\ - xb2 = _mm_unpackhi_pd(xb1,xb1);\ - c_pointer[LDC*3] += _mm_cvtsd_f64(xb2);\ - c_pointer ++;\ -} -static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8 -//perform C += A B , edge_n<8 must be satisfied. 
- if(k==0 || m==0 || edge_n==0 || (*alpha)==0.0) return; - double *a_block_pointer,*b_block_pointer,*b_base_pointer; - double *c_pointer = c; - __m256d yc1,yc2,yc3,yc4,ya1,yb1,yb2; - __m128d xc1,xc2,xa1,xb1,xb2; - double sc1,sa1,sb1; - BLASLONG m_count,n_count,k_count; - b_base_pointer = packed_b; -//now start calculation of the edge part - for(n_count=edge_n;n_count>3;n_count-=4){ - a_block_pointer = packed_a; - for(m_count=m;m_count>3;m_count-=4){ - b_block_pointer = b_base_pointer; - INIT_m4n4 - for(k_count=0;k_count1;m_count-=2){ - b_block_pointer = b_base_pointer; - INIT_m2n4 - for(k_count=0;k_count0){ - b_block_pointer = b_base_pointer; - INIT_m1n4 - for(k_count=0;k_count1;n_count-=2){ - a_block_pointer = packed_a; - for(m_count=m;m_count>3;m_count-=4){ - b_block_pointer = b_base_pointer; - INIT_m4n2 - for(k_count=0;k_count1;m_count-=2){ - b_block_pointer = b_base_pointer; - INIT_m2n2 - for(k_count=0;k_count0){ - b_block_pointer = b_base_pointer; - INIT_m1n2 - for(k_count=0;k_count0){ - a_block_pointer = packed_a; - for(m_count=m;m_count>3;m_count-=4){ - b_block_pointer = b_base_pointer; - INIT_m4n1 - for(k_count=0;k_count1;m_count-=2){ - b_block_pointer = b_base_pointer; - INIT_m2n1 - for(k_count=0;k_count0){ - b_block_pointer = b_base_pointer; - INIT_m1n1 - for(k_count=0;k_count0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C,&ALPHA); - if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8,&ALPHA); - return 0; -} +#include "common.h" +#include +#include + +//register usage: zmm3 for alpha, zmm0-zmm2 and zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators. + +/* row-major c_block */ +#define INNER_KERNEL_k1m1n8 \ + "prefetcht0 384(%1);"\ + "vmovupd (%1),%%zmm5; addq $64,%1;"\ + "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8;" + +#define INNER_KERNEL_k1m2n8 \ + INNER_KERNEL_k1m1n8\ + "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;" + +#define INNER_KERNEL_k1m1n16 \ + "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2);"\ + "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; addq $64,%1;"\ + "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9;" + +#define INNER_KERNEL_k1m2n16 \ + INNER_KERNEL_k1m1n16\ + "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;" + +#define INNER_KERNEL_k1m1n24 \ + "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2); prefetcht0 128(%1,%%r12,4);"\ + "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; vmovupd (%1,%%r12,4),%%zmm7; addq $64,%1;"\ + "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9; vfmadd231pd %%zmm7,%%zmm4,%%zmm10;" + +#define INNER_KERNEL_k1m2n24 \ + INNER_KERNEL_k1m1n24\ + "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;" + +/* row-major z-partition c_block */ +#define INNER_KERNEL_k1m4n8 \ + "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5; addq $32,%0;"\ + "vmovddup (%1),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm8; vfmadd231pd %%zmm5,%%zmm6,%%zmm10;"\ + "vmovddup 8(%1),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm9; vfmadd231pd %%zmm5,%%zmm7,%%zmm11;" + +#define INNER_KERNEL_k1m4n16 \ + INNER_KERNEL_k1m4n8\ + "vmovddup (%1,%%r12,2),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm12; vfmadd231pd %%zmm5,%%zmm6,%%zmm14;"\ + "vmovddup 8(%1,%%r12,2),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm13; vfmadd231pd %%zmm5,%%zmm7,%%zmm15;" + +#define INNER_KERNEL_k1m4n24 \ + 
INNER_KERNEL_k1m4n16\ + "vmovddup (%1,%%r12,4),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm16; vfmadd231pd %%zmm5,%%zmm6,%%zmm18;"\ + "vmovddup 8(%1,%%r12,4),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm17; vfmadd231pd %%zmm5,%%zmm7,%%zmm19;" + +#define INNER_KERNEL_k1m8n8 \ + "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5;"\ + "vbroadcastf32x4 (%0,%%r12,1),%%zmm6; vbroadcastf32x4 16(%0,%%r12,1),%%zmm7; addq $32,%0;"\ + "prefetcht0 128(%1);"\ + "vmovddup (%1),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm8; vfmadd231pd %%zmm5,%%zmm2,%%zmm10;"\ + "vfmadd231pd %%zmm6,%%zmm2,%%zmm12; vfmadd231pd %%zmm7,%%zmm2,%%zmm14;"\ + "vmovddup 8(%1),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm9; vfmadd231pd %%zmm5,%%zmm1,%%zmm11;"\ + "vfmadd231pd %%zmm6,%%zmm1,%%zmm13; vfmadd231pd %%zmm7,%%zmm1,%%zmm15;" + +#define INNER_KERNEL_k1m8n16 \ + INNER_KERNEL_k1m8n8\ + "prefetcht0 128(%1,%%r12,2);"\ + "vmovddup (%1,%%r12,2),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm16; vfmadd231pd %%zmm5,%%zmm2,%%zmm18;"\ + "vfmadd231pd %%zmm6,%%zmm2,%%zmm20; vfmadd231pd %%zmm7,%%zmm2,%%zmm22;"\ + "vmovddup 8(%1,%%r12,2),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm17; vfmadd231pd %%zmm5,%%zmm1,%%zmm19;"\ + "vfmadd231pd %%zmm6,%%zmm1,%%zmm21; vfmadd231pd %%zmm7,%%zmm1,%%zmm23;" + +#define INNER_KERNEL_k1m8n24 \ + INNER_KERNEL_k1m8n16\ + "prefetcht0 128(%1,%%r12,4);"\ + "vmovddup (%1,%%r12,4),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm24; vfmadd231pd %%zmm5,%%zmm2,%%zmm26;"\ + "vfmadd231pd %%zmm6,%%zmm2,%%zmm28; vfmadd231pd %%zmm7,%%zmm2,%%zmm30;"\ + "vmovddup 8(%1,%%r12,4),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm25; vfmadd231pd %%zmm5,%%zmm1,%%zmm27;"\ + "vfmadd231pd %%zmm6,%%zmm1,%%zmm29; vfmadd231pd %%zmm7,%%zmm1,%%zmm31;" + +/* micro kernels */ +#define INNER_KERNELm1(nn) \ + "cmpq $1,%2;jb "#nn"3f;"\ + #nn"4:\n\t"\ + INNER_KERNEL_k1m1n##nn "addq $8,%0;"\ + "decq %2;cmpq $1,%2;jnb "#nn"4b;"\ + #nn"3:\n\t" + +#define INNER_KERNELm2(nn) \ + "cmpq $1,%2;jb "#nn"0f;"\ + #nn"1:\n\t"\ + INNER_KERNEL_k1m2n##nn "addq $16,%0;"\ + "decq %2;cmpq $1,%2;jnb "#nn"1b;"\ + #nn"0:\n\t" + +#define INNER_KERNELm4(nn) \ + "cmpq $1,%2;jb "#nn"00f;"\ + #nn"01:\n\t"\ + INNER_KERNEL_k1m4n##nn "addq $64,%1;"\ + "decq %2;cmpq $1,%2;jnb "#nn"01b;"\ + #nn"00:\n\t" + +/* %10 for prefetch of C elements before storage; %4 = ldc(in bytes),%11 for prefetch of next B block */ +#define INNER_KERNELm8(nn) \ + "movq %3,%10;cmpq $18,%2;jb "#nn"001f;"\ + #nn"008:\n\t"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + "prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + "prefetcht1 (%11); addq $32,%11;"\ + "subq $6,%2;cmpq $18,%2;jnb "#nn"008b;"\ + "movq %3,%10;"\ + #nn"001:\n\t"\ + "cmpq $1,%2;jb "#nn"000f;"\ + "prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\ + INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ + "decq %2;jmp "#nn"001b;"\ + ""#nn"000:\n\t" + +#define INNER_INIT_m1n8 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8;" + +#define INNER_INIT_m2n8 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9;" + +#define INNER_INIT_m4n8 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;" + +#define INNER_INIT_m8n8 \ + INNER_INIT_m4n8\ + "vpxorq %%zmm12,%%zmm12,%%zmm12;vpxorq %%zmm13,%%zmm13,%%zmm13;vpxorq %%zmm14,%%zmm14,%%zmm14;vpxorq %%zmm15,%%zmm15,%%zmm15;" 
+ +#define INNER_INIT_m1n16 INNER_INIT_m2n8 + +#define INNER_INIT_m2n16 INNER_INIT_m4n8 + +#define INNER_INIT_m4n16 INNER_INIT_m8n8 + +#define INNER_INIT_m8n16 \ + INNER_INIT_m8n8\ + "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;"\ + "vpxorq %%zmm20,%%zmm20,%%zmm20;vpxorq %%zmm21,%%zmm21,%%zmm21;vpxorq %%zmm22,%%zmm22,%%zmm22;vpxorq %%zmm23,%%zmm23,%%zmm23;" + +#define INNER_INIT_m1n24 \ + "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;" + +#define INNER_INIT_m2n24 \ + INNER_INIT_m1n24\ + "vpxorq %%zmm11,%%zmm11,%%zmm11; vpxorq %%zmm12,%%zmm12,%%zmm12; vpxorq %%zmm13,%%zmm13,%%zmm13;" + +#define INNER_INIT_m4n24 \ + INNER_INIT_m4n16\ + "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;" + +#define INNER_INIT_m8n24 \ + INNER_INIT_m8n16\ + "vpxorq %%zmm24,%%zmm24,%%zmm24;vpxorq %%zmm25,%%zmm25,%%zmm25;vpxorq %%zmm26,%%zmm26,%%zmm26;vpxorq %%zmm27,%%zmm27,%%zmm27;"\ + "vpxorq %%zmm28,%%zmm28,%%zmm28;vpxorq %%zmm29,%%zmm29,%%zmm29;vpxorq %%zmm30,%%zmm30,%%zmm30;vpxorq %%zmm31,%%zmm31,%%zmm31;" + +#define INNER_SETINDEX \ + "vpinsrq $0,%4,%%xmm4,%%xmm4; vbroadcastsd %%xmm4,%%zmm4;"\ + "kxnorw %%k1,%%k1,%%k1; kshiftlw $1,%%k1,%%k1; vpxorq %%zmm6,%%zmm6,%%zmm6; vmovapd %%zmm4,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\ + "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};" + +#define INNER_STORE_m1n8(c1,disp) \ + "kxnorw %%k1,%%k1,%%k1;"\ + "vgatherqpd "#disp"(%10,%%zmm6,1), %%zmm7 %{%%k1%};"\ + "vfmadd132pd %%zmm3,%%zmm7,"#c1";"\ + "kxnorw %%k1,%%k1,%%k1;"\ + "vscatterqpd "#c1", "#disp"(%10,%%zmm6,1) %{%%k1%};" + +#define INNER_SAVE_m1n8 \ + "movq %3,%10;"\ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0) + +#define INNER_SAVE_m1n16 \ + INNER_SAVE_m1n8\ + "leaq (%10,%4,8),%10;"\ + INNER_STORE_m1n8(%%zmm9,0) + +#define INNER_SAVE_m1n24 \ + INNER_SAVE_m1n16\ + "leaq (%10,%4,8),%10;"\ + INNER_STORE_m1n8(%%zmm10,0) + +#define INNER_SAVE_m2n8 \ + "movq %3,%10;"\ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0)\ + INNER_STORE_m1n8(%%zmm9,8) + +#define INNER_SAVE_m2n16 \ + "movq %3,%10;"\ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0)\ + INNER_STORE_m1n8(%%zmm10,8)\ + "leaq (%10,%4,8),%10;"\ + INNER_STORE_m1n8(%%zmm9,0)\ + INNER_STORE_m1n8(%%zmm11,8) + +#define INNER_SAVE_m2n24 \ + "movq %3,%10;"\ + INNER_SETINDEX\ + INNER_STORE_m1n8(%%zmm8,0)\ + INNER_STORE_m1n8(%%zmm11,8)\ + "leaq (%10,%4,8),%10;"\ + INNER_STORE_m1n8(%%zmm9,0)\ + INNER_STORE_m1n8(%%zmm12,8)\ + "leaq (%10,%4,8),%10;"\ + INNER_STORE_m1n8(%%zmm10,0)\ + INNER_STORE_m1n8(%%zmm13,8) + +#define INNER_TRANS_4x8(c1,c2,c3,c4) \ + "vblendmpd "#c3","#c1",%%zmm4%{%6%}; vblendmpd "#c4","#c2",%%zmm6%{%6%};"\ + "vshuff64x2 $177,%%zmm4,%%zmm4,%%zmm4; vshuff64x2 $177,%%zmm6,%%zmm6,%%zmm6;"\ + "vblendmpd "#c1",%%zmm4,"#c1"%{%6%}; vblendmpd "#c2",%%zmm6,"#c2"%{%6%};"\ + "vblendmpd %%zmm4,"#c3","#c3"%{%6%}; vblendmpd %%zmm6,"#c4","#c4"%{%6%};"\ + +#define INNER_TRANS_f128_4x4(c1,c2,c3,c4) \ + "vshuff64x2 $68,"#c3","#c1",%%zmm4; vshuff64x2 $17,"#c4","#c2",%%zmm5;"\ + "vshuff64x2 $238,"#c3","#c1",%%zmm6; vshuff64x2 $187,"#c4","#c2",%%zmm7;"\ + "vblendmpd 
%%zmm5,%%zmm4,"#c2"%{%6%}; vshuff64x2 $177,"#c2","#c2","#c2"; vblendmpd %%zmm4,%%zmm5,"#c1"%{%6%};"\ + "vblendmpd %%zmm7,%%zmm6,"#c4"%{%6%}; vshuff64x2 $177,"#c4","#c4","#c4"; vblendmpd %%zmm6,%%zmm7,"#c3"%{%6%};" + +#define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ + INNER_TRANS_f128_4x4(c1,c3,c5,c7) INNER_TRANS_f128_4x4(c2,c4,c6,c8) + +//%7 for k01(input) only when m=4 +#define INNER_STORE_4x8(c1,c2,c3,c4) \ + "vmovupd (%10),%%zmm4%{%5%};vmovupd -32(%10,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\ + "vmovupd "#c1",(%10)%{%5%}; vmovupd "#c1",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ + "vmovupd (%10),%%zmm5%{%5%};vmovupd -32(%10,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\ + "vmovupd "#c2",(%10)%{%5%}; vmovupd "#c2",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ + "vmovupd (%10),%%zmm6%{%5%};vmovupd -32(%10,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\ + "vmovupd "#c3",(%10)%{%5%}; vmovupd "#c3",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ + "vmovupd (%10),%%zmm7%{%5%};vmovupd -32(%10,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\ + "vmovupd "#c4",(%10)%{%5%}; vmovupd "#c4",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\ + "leaq (%10,%4,4),%10;" + +#define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ + "vfmadd213pd (%10),%%zmm3,"#c1"; vmovupd "#c1",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%10,%4,1); leaq (%10,%4,2),%10;"\ + "vfmadd213pd (%10),%%zmm3,"#c3"; vmovupd "#c3",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%10,%4,1); leaq (%10,%4,2),%10;"\ + "vfmadd213pd (%10),%%zmm3,"#c5"; vmovupd "#c5",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%10,%4,1); leaq (%10,%4,2),%10;"\ + "vfmadd213pd (%10),%%zmm3,"#c7"; vmovupd "#c7",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%10,%4,1); leaq (%10,%4,2),%10;" + +#define INNER_SAVE_m4n8 \ + "movq %3,%10;"\ + INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\ + INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11) + +#define INNER_SAVE_m4n16 \ + INNER_SAVE_m4n8\ + INNER_TRANS_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ + INNER_STORE_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15) + +#define INNER_SAVE_m4n24 \ + INNER_SAVE_m4n16\ + INNER_TRANS_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)\ + INNER_STORE_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19) + +#define INNER_SAVE_m8n8 \ + "movq %3,%10;"\ + INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ + INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15) + +#define INNER_SAVE_m8n16 \ + INNER_SAVE_m8n8\ + INNER_TRANS_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)\ + INNER_STORE_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23) + +#define INNER_SAVE_m8n24 \ + INNER_SAVE_m8n16\ + INNER_TRANS_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)\ + INNER_STORE_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31) + +#define COMPUTE_n8 {\ + b_pref = packed_b_pointer + 8 * K;\ + __asm__ __volatile__(\ + "vbroadcastsd (%9),%%zmm3;"\ + "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ + "cmpq $8,%8; jb 42222f;"\ + "42221:\n\t"\ + INNER_INIT_m8n8\ + INNER_KERNELm8(8)\ + INNER_SAVE_m8n8\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ + "addq $64,%3;"\ + "subq $8,%8; cmpq $8,%8; jnb 42221b;"\ + "42222:\n\t"\ + "cmpq $4,%8; jb 42223f;"\ + INNER_INIT_m4n8\ + INNER_KERNELm4(8)\ + INNER_SAVE_m4n8\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $32,%3;"\ + "subq $4,%8;"\ + "42223:\n\t"\ + "cmpq $2,%8; jb 42224f;"\ + 
INNER_INIT_m2n8\ + INNER_KERNELm2(8)\ + INNER_SAVE_m2n8\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $16,%3;"\ + "subq $2,%8;"\ + "42224:\n\t"\ + "cmpq $1,%8; jb 42225f;"\ + INNER_INIT_m1n8\ + INNER_KERNELm1(8)\ + INNER_SAVE_m1n8\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $8,%3;"\ + "42225:\n\t"\ + "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ + "shlq $3,%4;addq %4,%3;shrq $3,%4;"\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ + "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ + ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\ + a_block_pointer -= M * K;\ +} +#define COMPUTE_n16 {\ + b_pref = packed_b_pointer + 16 * K;\ + __asm__ __volatile__(\ + "vbroadcastsd (%9),%%zmm3;"\ + "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ + "cmpq $8,%8; jb 32222f;"\ + "32221:\n\t"\ + INNER_INIT_m8n16\ + INNER_KERNELm8(16)\ + INNER_SAVE_m8n16\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ + "addq $64,%3;"\ + "subq $8,%8; cmpq $8,%8; jnb 32221b;"\ + "32222:\n\t"\ + "cmpq $4,%8; jb 32223f;"\ + INNER_INIT_m4n16\ + INNER_KERNELm4(16)\ + INNER_SAVE_m4n16\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $32,%3;"\ + "subq $4,%8;"\ + "32223:\n\t"\ + "cmpq $2,%8; jb 32224f;"\ + INNER_INIT_m2n16\ + INNER_KERNELm2(16)\ + INNER_SAVE_m2n16\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $16,%3;"\ + "subq $2,%8;"\ + "32224:\n\t"\ + "cmpq $1,%8; jb 32225f;"\ + INNER_INIT_m1n16\ + INNER_KERNELm1(16)\ + INNER_SAVE_m1n16\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $8,%3;"\ + "32225:\n\t"\ + "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ + "shlq $4,%4;addq %4,%3;shrq $4,%4;"\ + "leaq (%1,%%r12,4),%1;"\ + :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ + "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ + ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ + "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\ + a_block_pointer -= M * K;\ +} +#define COMPUTE_n24 {\ + b_pref = packed_b_pointer + 24 * K;\ + __asm__ __volatile__(\ + "vbroadcastsd (%9),%%zmm3;"\ + "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\ + "cmpq $8,%8; jb 22222f;"\ + "22221:\n\t"\ + INNER_INIT_m8n24\ + INNER_KERNELm8(24)\ + INNER_SAVE_m8n24\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\ + "addq $64,%3;"\ + "subq $8,%8; cmpq $8,%8; jnb 22221b;"\ + "22222:\n\t"\ + "cmpq $4,%8; jb 22223f;"\ + INNER_INIT_m4n24\ + INNER_KERNELm4(24)\ + INNER_SAVE_m4n24\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $32,%3;"\ + "subq $4,%8;"\ + "22223:\n\t"\ + "cmpq $2,%8; jb 22224f;"\ + INNER_INIT_m2n24\ + INNER_KERNELm2(24)\ + INNER_SAVE_m2n24\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $16,%3;"\ + "subq $2,%8;"\ + "22224:\n\t"\ + "cmpq $1,%8; jb 22225f;"\ + INNER_INIT_m1n24\ + INNER_KERNELm1(24)\ + INNER_SAVE_m1n24\ + "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\ + "addq $8,%3;"\ + "22225:\n\t"\ + "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\ + "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\ + "leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\ + 
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ + "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)::\ + "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18",\ + "zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\ + a_block_pointer -= M * K;\ +} +static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=4,ocopy=8 +//perform C += A B + if(k==0 || m==0 || ndiv8==0) return; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double); + int64_t K = (int64_t)k; int64_t M = (int64_t)m; + double *a_block_pointer,*b_pref; + double *c_pointer = c,*c_store = c; + __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033; + BLASLONG ndiv8_count; + double *packed_b_pointer = packed_b; + a_block_pointer = packed_a; + for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){ + COMPUTE_n24 + } + for(;ndiv8_count>1;ndiv8_count-=2){ + COMPUTE_n16 + } + if(ndiv8_count>0){ + COMPUTE_n8 + } +} + +/* __m256d accumulators: yc1-yc4; temporary variables: ya1,yb1-yb2 */ +/* __m128d accumulators: xc1-xc2; temporary variables: xa1,xb1-xb2 */ +/* double accumulator: sc1; temporary variables: sa1,sb1 */ +/* column-major c_block */ +#define KERNEL_m4n4k1 {\ + ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ + yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\ + yb1 = _mm256_broadcast_sd(b_block_pointer+2); yc3 = _mm256_fmadd_pd(ya1,yb1,yc3);\ + yb2 = _mm256_broadcast_sd(b_block_pointer+3); yc4 = _mm256_fmadd_pd(ya1,yb2,yc4);\ + b_block_pointer+=4;\ +} +#define KERNEL_m4n2k1 {\ + ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ + yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\ + b_block_pointer+=2;\ +} +#define KERNEL_m4n1k1 {\ + ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\ + yb1 = _mm256_broadcast_sd(b_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + b_block_pointer++;\ +} +#define INIT_m4n1 yc1=_mm256_setzero_pd(); +#define INIT_m4n2 yc2=INIT_m4n1 +#define INIT_m4n4 yc4=yc3=INIT_m4n2 +#define SAVE_m4n1 {\ + yb1 = _mm256_broadcast_sd(alpha);\ + ya1 = _mm256_loadu_pd(c_pointer);\ + yc1 = _mm256_fmadd_pd(yc1,yb1,ya1);\ + _mm256_storeu_pd(c_pointer,yc1);\ + c_pointer += 4;\ +} +#define SAVE_m4n2 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ + yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\ + _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ + c_pointer += 4;\ +} +#define SAVE_m4n4 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ + yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\ + _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\ + c_pointer += LDC*2;\ + yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\ + yc3 = _mm256_fmadd_pd(yc3,ya1,yb1); yc4 = _mm256_fmadd_pd(yc4,ya1,yb2);\ + _mm256_storeu_pd(c_pointer,yc3); _mm256_storeu_pd(c_pointer+LDC,yc4);\ + c_pointer += 4-LDC*2;\ +} +#define 
KERNEL_m2n2k1 {\ + xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\ + xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ + xb2 = _mm_loaddup_pd(b_block_pointer+1); xc2 = _mm_fmadd_pd(xa1,xb2,xc2);\ + b_block_pointer += 2;\ +} +#define KERNEL_m2n1k1 {\ + xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\ + xb1 = _mm_loaddup_pd(b_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ + b_block_pointer ++;\ +} +#define INIT_m2n1 xc1=_mm_setzero_pd(); +#define INIT_m2n2 xc2=INIT_m2n1 +#define SAVE_m2n1 {\ + xb1 = _mm_loaddup_pd(alpha);\ + xa1 = _mm_loadu_pd(c_pointer);\ + xc1 = _mm_fmadd_pd(xc1,xb1,xa1);\ + _mm_storeu_pd(c_pointer,xc1);\ + c_pointer += 2;\ +} +#define SAVE_m2n2 {\ + xa1 = _mm_loaddup_pd(alpha);\ + xb1 = _mm_loadu_pd(c_pointer); xb2 = _mm_loadu_pd(c_pointer+LDC);\ + xc1 = _mm_fmadd_pd(xc1,xa1,xb1); xc2 = _mm_fmadd_pd(xc2,xa1,xb2);\ + _mm_storeu_pd(c_pointer,xc1); _mm_storeu_pd(c_pointer+LDC,xc2);\ + c_pointer += 2;\ +} +#define KERNEL_m1n1k1 {\ + sa1 = *a_block_pointer; a_block_pointer++;\ + sb1 = *b_block_pointer; sc1 += sa1 * sb1;\ + b_block_pointer ++;\ +} +#define INIT_m1n1 sc1=0.0; +#define SAVE_m1n1 {\ + *c_pointer += sc1 * (*alpha);\ + c_pointer++;\ +} +/* row-major c_block */ +#define KERNEL_m2n4k1 {\ + yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\ + ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + ya1 = _mm256_broadcast_sd(a_block_pointer+1);yc2 = _mm256_fmadd_pd(ya1,yb1,yc2);\ + a_block_pointer += 2;\ +} +#define KERNEL_m1n4k1 {\ + yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\ + ya1 = _mm256_broadcast_sd(a_block_pointer); yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\ + a_block_pointer ++;\ +} +#define KERNEL_m1n2k1 {\ + xb1 = _mm_loadu_pd(b_block_pointer);b_block_pointer+=2;\ + xa1 = _mm_loaddup_pd(a_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\ + a_block_pointer ++;\ +} +#define INIT_m1n2 INIT_m2n1 +#define INIT_m1n4 INIT_m4n1 +#define INIT_m2n4 INIT_m4n2 +#define SAVE_m2n4 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yc1 = _mm256_mul_pd(yc1,ya1);\ + yc2 = _mm256_mul_pd(yc2,ya1);\ + yb1 = _mm256_unpacklo_pd(yc1,yc2);\ + yb2 = _mm256_unpackhi_pd(yc1,yc2);\ + xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer),_mm256_extractf128_pd(yb1,0));\ + xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+LDC),_mm256_extractf128_pd(yb2,0));\ + _mm_storeu_pd(c_pointer,xb1);\ + _mm_storeu_pd(c_pointer+LDC,xb2);\ + xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer+2*LDC),_mm256_extractf128_pd(yb1,1));\ + xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+3*LDC),_mm256_extractf128_pd(yb2,1));\ + _mm_storeu_pd(c_pointer+2*LDC,xb1);\ + _mm_storeu_pd(c_pointer+3*LDC,xb2);\ + c_pointer += 2;\ +} +#define SAVE_m1n2 {\ + xb1 = _mm_loaddup_pd(alpha);\ + xc1 = _mm_mul_pd(xc1,xb1);\ + *c_pointer += _mm_cvtsd_f64(xc1);\ + xa1 = _mm_unpackhi_pd(xc1,xc1);\ + c_pointer[LDC]+= _mm_cvtsd_f64(xa1);\ + c_pointer ++;\ +} +#define SAVE_m1n4 {\ + ya1 = _mm256_broadcast_sd(alpha);\ + yc1 = _mm256_mul_pd(yc1,ya1);\ + xb1 = _mm256_extractf128_pd(yc1,0);\ + *c_pointer += _mm_cvtsd_f64(xb1);\ + xb2 = _mm_unpackhi_pd(xb1,xb1);\ + c_pointer[LDC] += _mm_cvtsd_f64(xb2);\ + xb1 = _mm256_extractf128_pd(yc1,1);\ + c_pointer[LDC*2] += _mm_cvtsd_f64(xb1);\ + xb2 = _mm_unpackhi_pd(xb1,xb1);\ + c_pointer[LDC*3] += _mm_cvtsd_f64(xb2);\ + c_pointer ++;\ +} +static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8 +//perform C += A B , edge_n<8 must be satisfied. 
+ if(k==0 || m==0 || edge_n==0 || (*alpha)==0.0) return; + double *a_block_pointer,*b_block_pointer,*b_base_pointer; + double *c_pointer = c; + __m256d yc1,yc2,yc3,yc4,ya1,yb1,yb2; + __m128d xc1,xc2,xa1,xb1,xb2; + double sc1,sa1,sb1; + BLASLONG m_count,n_count,k_count; + b_base_pointer = packed_b; +//now start calculation of the edge part + for(n_count=edge_n;n_count>3;n_count-=4){ + a_block_pointer = packed_a; + for(m_count=m;m_count>3;m_count-=4){ + b_block_pointer = b_base_pointer; + INIT_m4n4 + for(k_count=0;k_count1;m_count-=2){ + b_block_pointer = b_base_pointer; + INIT_m2n4 + for(k_count=0;k_count0){ + b_block_pointer = b_base_pointer; + INIT_m1n4 + for(k_count=0;k_count1;n_count-=2){ + a_block_pointer = packed_a; + for(m_count=m;m_count>3;m_count-=4){ + b_block_pointer = b_base_pointer; + INIT_m4n2 + for(k_count=0;k_count1;m_count-=2){ + b_block_pointer = b_base_pointer; + INIT_m2n2 + for(k_count=0;k_count0){ + b_block_pointer = b_base_pointer; + INIT_m1n2 + for(k_count=0;k_count0){ + a_block_pointer = packed_a; + for(m_count=m;m_count>3;m_count-=4){ + b_block_pointer = b_base_pointer; + INIT_m4n1 + for(k_count=0;k_count1;m_count-=2){ + b_block_pointer = b_base_pointer; + INIT_m2n1 + for(k_count=0;k_count0){ + b_block_pointer = b_base_pointer; + INIT_m1n1 + for(k_count=0;k_count0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C,&ALPHA); + if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8,&ALPHA); + return 0; +} diff --git a/kernel/x86_64/dgemm_kernel_8x2_bulldozer.S b/kernel/x86_64/dgemm_kernel_8x2_bulldozer.S index 40c5892c6..c353a5913 100644 --- a/kernel/x86_64/dgemm_kernel_8x2_bulldozer.S +++ b/kernel/x86_64/dgemm_kernel_8x2_bulldozer.S @@ -1,4413 +1,4413 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. 
*/ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -/********************************************************************* -* 2013/06/02 Saar -* -* Parameter: -* UNROLL_M 8 -* UNROLL_N 2 -* DGEMM_P 360 -* DGEMM_Q 160 -* -* Performance at m x n without prefetch of BO: -* -* 5760x5760 93.4 GFLOPS with 8 threads on 4 modules (ACML: 90.8 GFLOPS) -* 5760x5760 84.2 GFLOPS with 4 threads on 4 modules (ACML: 82.4 GFLOPS) -* 3840x3840 50.3 GFLOPS with 2 threads on 2 modules (ACML: 49.5 GFLOPS) -* -* 5760x5760 56.4 GFLOPS with 4 threads on 2 modules (ACML: 58.5 GFLOPS) -* 3840x3840 29.0 GFLOPS with 2 threads on 1 modules (ACML: 30.2 GFLOPS) -* 3840x3840 26.1 GFLOPS with 1 threads on 1 modules (ACML: 25.9 GFLOPS) -* -*********************************************************************/ - -/********************************************************************* -* 2013/06/03 Saar -* -* Parameter: -* UNROLL_M 8 -* UNROLL_N 2 -* DGEMM_P 336 -* DGEMM_Q 168 -* NO_WARMUP 1 -* NO_AFFINITY 1 -* GEMM_MULTITHREAD_THRESHOLD 4 -* -* Performance at m x n with prefetch of BO: -* -* 8064x3840 93.7 GFLOPS with 8 threads on 4 modules (ACML: 93.6 GFLOPS) -* 6048x2880 85.1 GFLOPS with 4 threads on 4 modules (ACML: 84.2 GFLOPS) -* 6048x2880 52.0 GFLOPS with 2 threads on 2 modules (ACML: 50.0 GFLOPS) -* -* 6048x2880 56.3 GFLOPS with 4 threads on 2 modules (ACML: 57.6 GFLOPS) -* 4032x1920 29.5 GFLOPS with 2 threads on 1 modules (ACML: 30.5 GFLOPS) -* 4032x1920 26.9 GFLOPS with 1 threads on 1 modules (ACML: 26.1 GFLOPS) -* -*********************************************************************/ - -/********************************************************************* -* 2013/06/04 Saar -* -* Parameter: -* UNROLL_M 8 -* UNROLL_N 2 -* DGEMM_P 384 -* DGEMM_Q 168 -* NO_WARMUP 1 -* NO_AFFINITY 1 -* GEMM_MULTITHREAD_THRESHOLD 4 -* -* Performance at m x n with prefetch of BO: -* -* 6144x5376 94.6 GFLOPS with 8 threads on 4 modules (ACML: 90.5 GFLOPS) -* 6144x5376 86.0 GFLOPS with 4 threads on 4 modules (ACML: 81.5 GFLOPS) -* 4608x4032 52.0 GFLOPS with 2 threads on 2 modules (ACML: 47.5 GFLOPS) -* -* 6144x5376 57.3 GFLOPS with 4 threads on 2 modules (ACML: 56.5 GFLOPS) -* 4608x4032 29.6 GFLOPS with 2 threads on 1 modules (ACML: 30.2 GFLOPS) -* 4608x4032 26.9 GFLOPS with 1 threads on 1 modules (ACML: 25.6 GFLOPS) -* -*********************************************************************/ - - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 -#define LB2_OFFSET 4096 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 
128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - - -#define A_PR1 384 -#define B_PR1 192 - -#define KERNEL8x3_1(xx) \ - prefetcht0 A_PR1(AO,%rax,8) ;\ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL8x3_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,8) ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL8x3_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,8) ;\ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL8x3_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,8) ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ - 
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ - addq $12, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x3_SUB(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x3_1(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL4x3_2(xx) \ - vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL4x3_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL4x3_4(xx) \ - vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - addq $12, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x3_SUB(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 
8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - - - - - -/*******************************************************************************************/ - -#define KERNEL2x3_1(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL2x3_2(xx) \ - vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL2x3_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL2x3_4(xx) \ - vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x3_SUB(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x3_1(xx) \ - vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_2(xx) \ - vmovsd -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_3(xx) \ - vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_4(xx) \ - vmovsd 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x3_SUB(xx) \ - vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd 
%xmm6,%xmm3,%xmm0,%xmm6 ;\ - - - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -#define KERNEL8x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,8) ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL8x2_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,8) ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL8x2_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,8) ;\ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL8x2_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,8) ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - addq $8, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x2_SUB(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x2_1(xx) \ - vmovddup -4 * 
SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL4x2_2(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL4x2_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL4x2_4(xx) \ - vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x2_SUB(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - - -/*******************************************************************************************/ - -#define KERNEL2x2_1(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL2x2_2(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL2x2_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL2x2_4(xx) \ - vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x2_SUB(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_2(xx) \ - vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -1 * SIZE(BO, BI, 8), 
%xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_3(xx) \ - vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_4(xx) \ - vmovsd 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x2_SUB(xx) \ - vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - - - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -#define KERNEL8x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,8) ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL8x1_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,8) ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL8x1_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,8) ;\ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL8x1_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,8) ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - addq $4, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x1_SUB(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x1_1(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL4x1_2(xx) \ - vmovddup -1 * SIZE(BO, BI, 8), %xmm1 
;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL4x1_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL4x1_4(xx) \ - vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - addq $4, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x1_SUB(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - - -/*******************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL2x1_2(xx) \ - vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL2x1_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL2x1_4(xx) \ - vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x1_SUB(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_2(xx) \ - vmovsd -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_3(xx) \ - vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_4(xx) \ - vmovsd 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x1_SUB(xx) \ - vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq 
%rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -.L6_01: - // copy to sub buffer - movq K, %rax - salq $1,%rax // K * 2 - movq B, BO1 - leaq (B,%rax,8), BO2 // next offset to BO2 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L6_02a - ALIGN_4 - -.L6_02: - prefetcht0 512(BO1) - prefetcht0 512(BO2) - prefetchw 512(BO) - vmovups (BO1), %xmm0 - vmovups 2*SIZE(BO1), %xmm2 - vmovups 4*SIZE(BO1), %xmm4 - vmovups 6*SIZE(BO1), %xmm6 - vmovsd (BO2), %xmm1 - vmovsd 2*SIZE(BO2), %xmm3 - vmovsd 4*SIZE(BO2), %xmm5 - vmovsd 6*SIZE(BO2), %xmm7 - vmovups %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovups %xmm2, 3*SIZE(BO) - vmovsd %xmm3, 5*SIZE(BO) - vmovups %xmm4, 6*SIZE(BO) - vmovsd %xmm5, 8*SIZE(BO) - vmovups %xmm6, 9*SIZE(BO) - vmovsd %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - decq %rax - jnz .L6_02 - -.L6_02a: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L6_02c - ALIGN_4 - -.L6_02b: - - vmovups (BO1), %xmm0 - vmovsd (BO2), %xmm1 - vmovups %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_02b - -.L6_02c: - - movq K, %rax - salq $1,%rax // K * 2 - leaq (B,%rax,8), BO1 // next offset to BO1 - leaq (BO1,%rax,8), BO2 // next offset to BO1 - leaq BUFFER2, BO // second buffer to BO - movq K, %rax - sarq $2, %rax // k / 4 - jz .L6_03a - ALIGN_4 - - -.L6_03: - - prefetcht0 512(BO2) - prefetchw 512(BO) - vmovups (BO2), %xmm0 - vmovups 2*SIZE(BO2), %xmm2 - vmovups 4*SIZE(BO2), %xmm4 - vmovups 6*SIZE(BO2), %xmm6 - vmovsd 1*SIZE(BO1), %xmm1 - vmovsd 3*SIZE(BO1), %xmm3 - vmovsd 5*SIZE(BO1), %xmm5 - vmovsd 7*SIZE(BO1), %xmm7 - vmovsd %xmm1, 0*SIZE(BO) - vmovups %xmm0, 1*SIZE(BO) - vmovsd %xmm3, 3*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovsd %xmm5, 6*SIZE(BO) - vmovups %xmm4, 7*SIZE(BO) - vmovsd %xmm7, 9*SIZE(BO) - vmovups %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - decq %rax - jnz .L6_03 - -.L6_03a: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L6_03c - ALIGN_4 - - -.L6_03b: - - vmovsd 1*SIZE(BO1), %xmm0 - vmovups (BO2), %xmm1 - vmovsd %xmm0, (BO) - vmovups %xmm1, 1*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_03b - - -.L6_03c: - - movq BO2, B // next offset of B - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L6_20 - - ALIGN_4 - -.L6_11: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L6_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL8x3_3(xxx) - 
KERNEL8x3_4(xxx) - - je .L6_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L6_16 - - jmp .L6_12 - ALIGN_4 - -.L6_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_17: - - KERNEL8x3_SUB(xxx) - addq $3, BI - addq $8, %rax - jl .L6_17 - ALIGN_4 - - -.L6_19: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) - vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) - - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L6_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $7, M - jz .L7_10 // to next 3 lines of N - - testq $4, M - jz .L6_30 - - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - jmp .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L6_27 - ALIGN_4 - - -.L6_29: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - 
vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_32: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - jmp .L6_32 - ALIGN_4 - -.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L6_37 - ALIGN_4 - - -.L6_39: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L7_10 // to next 3 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_42: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - jmp .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L6_47 - ALIGN_4 - - -.L6_49: - - vmovddup ALPHA, %xmm0 - - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - - - 
addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - -/***************************************************************************************************************/ - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L7_20 - ALIGN_4 - -.L7_11: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - - vzeroall - - movq K, %rax - - - andq $-8, %rax - je .L7_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - - ALIGN_4 - -.L7_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L7_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L7_16 - - jmp .L7_12 - ALIGN_4 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_17: - - KERNEL8x3_SUB(xxx) - addq $3, BI - addq $8, %rax - jl .L7_17 - ALIGN_4 - - -.L7_19: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) - vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) - - - - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L7_11 - ALIGN_4 - -.L7_20: - // Test rest of M - - testq $7, M - jz .L7_60 // to next 6 lines of N - - testq $4, M - jz .L7_30 - - ALIGN_4 - -.L7_21: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - prefetcht0 
B_PR1(BO,BI,8) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - jmp .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L7_27 - ALIGN_4 - - -.L7_29: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_32: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - jmp .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L7_37 - ALIGN_4 - - -.L7_39: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - - -.L7_40: - testq $1, M - jz .L7_60 // to next 6 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - - andq $-8, %rax - je .L7_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_42: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - prefetcht0 B_PR1(BO,BI,8) - 
KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - prefetcht0 B_PR1+64(BO,BI,8) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,8) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - jmp .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L7_47 - ALIGN_4 - - -.L7_49: - - vmovddup ALPHA, %xmm0 - - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - - - addq $1 * SIZE, CO1 # coffset += 1 - -.L7_60: - - decq J // j -- - jg .L6_01 - - -.L2_0: - cmpq $0, Nmod6 // N % 6 == 0 - je .L999 - -/************************************************************************************************ -* Loop for Nmod6 / 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - sarq $1, J // j = j / 2 - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L2_20 - - ALIGN_4 - -.L2_11: - - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * 
SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $7, M - jz .L2_60 // to next 2 lines of N - - testq $4, M - jz .L2_30 - - ALIGN_4 - -.L2_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - - addq $2 * SIZE, CO1 # coffset += 2 - 
ALIGN_4 - - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA, %xmm0 - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L2_60: - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L1_20 - - ALIGN_4 - -.L1_11: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - 
vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $7, M - jz .L999 - - testq $4, M - jz .L1_30 - - ALIGN_4 - -.L1_21: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - - vmovups %xmm4 , (CO1) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - 
KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA, %xmm0 - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - - vmovsd %xmm4 , (CO1) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - -.L2_0: - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L2_20 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO 
-#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - vmulpd %xmm0, %xmm10,%xmm10 - vmulpd %xmm0, %xmm13,%xmm13 - - vmulpd %xmm0, %xmm5,%xmm5 - vmulpd %xmm0, %xmm8,%xmm8 - vmulpd %xmm0, %xmm11,%xmm11 - vmulpd %xmm0, %xmm14,%xmm14 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $7, M - jz .L2_60 // to next 2 lines of N - - testq $4, M - jz .L2_30 - - 
ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - - vmulpd %xmm0, %xmm5,%xmm5 - vmulpd %xmm0, %xmm8,%xmm8 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), 
AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm5,%xmm5 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - 
KERNEL1x2_4(xxx) - - je .L2_46 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulsd %xmm0, %xmm4,%xmm4 - vmulsd %xmm0, %xmm5,%xmm5 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - 
- je .L1_16 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - vmulpd %xmm0, %xmm10,%xmm10 - vmulpd %xmm0, %xmm13,%xmm13 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $7, M - jz .L999 - - testq $4, M - jz .L1_30 - - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - 
KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - 
movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - prefetcht0 B_PR1(BO,BI,8) - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulsd %xmm0, %xmm4,%xmm4 - -#endif - - vmovsd %xmm4 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - -#endif +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +/********************************************************************* +* 2013/06/02 Saar +* +* Parameter: +* UNROLL_M 8 +* UNROLL_N 2 +* DGEMM_P 360 +* DGEMM_Q 160 +* +* Performance at m x n without prefetch of BO: +* +* 5760x5760 93.4 GFLOPS with 8 threads on 4 modules (ACML: 90.8 GFLOPS) +* 5760x5760 84.2 GFLOPS with 4 threads on 4 modules (ACML: 82.4 GFLOPS) +* 3840x3840 50.3 GFLOPS with 2 threads on 2 modules (ACML: 49.5 GFLOPS) +* +* 5760x5760 56.4 GFLOPS with 4 threads on 2 modules (ACML: 58.5 GFLOPS) +* 3840x3840 29.0 GFLOPS with 2 threads on 1 modules (ACML: 30.2 GFLOPS) +* 3840x3840 26.1 GFLOPS with 1 threads on 1 modules (ACML: 25.9 GFLOPS) +* +*********************************************************************/ + +/********************************************************************* +* 2013/06/03 Saar +* +* Parameter: +* UNROLL_M 8 +* UNROLL_N 2 +* DGEMM_P 336 +* DGEMM_Q 168 +* NO_WARMUP 1 +* NO_AFFINITY 1 +* GEMM_MULTITHREAD_THRESHOLD 4 +* +* Performance at m x n with prefetch of BO: +* +* 8064x3840 93.7 GFLOPS with 8 threads on 4 modules (ACML: 93.6 GFLOPS) +* 6048x2880 85.1 GFLOPS with 4 threads on 4 modules (ACML: 84.2 GFLOPS) +* 6048x2880 52.0 GFLOPS with 2 threads on 2 modules (ACML: 50.0 GFLOPS) +* +* 6048x2880 56.3 GFLOPS with 4 threads on 2 modules (ACML: 57.6 GFLOPS) +* 4032x1920 29.5 GFLOPS with 2 threads on 1 modules (ACML: 30.5 GFLOPS) +* 4032x1920 26.9 GFLOPS with 1 threads on 1 modules (ACML: 26.1 GFLOPS) +* +*********************************************************************/ + +/********************************************************************* +* 2013/06/04 Saar +* +* Parameter: +* UNROLL_M 8 +* UNROLL_N 2 +* DGEMM_P 384 +* DGEMM_Q 168 +* NO_WARMUP 1 +* NO_AFFINITY 1 +* GEMM_MULTITHREAD_THRESHOLD 4 +* +* Performance at m x n with prefetch of BO: +* +* 6144x5376 94.6 GFLOPS with 8 threads on 4 modules (ACML: 90.5 GFLOPS) +* 6144x5376 86.0 GFLOPS with 4 threads on 4 modules (ACML: 81.5 GFLOPS) +* 4608x4032 52.0 GFLOPS with 2 threads on 2 modules (ACML: 47.5 GFLOPS) +* +* 6144x5376 57.3 GFLOPS with 4 threads on 2 modules (ACML: 56.5 GFLOPS) +* 4608x4032 29.6 GFLOPS with 2 threads on 1 modules (ACML: 30.2 GFLOPS) +* 4608x4032 26.9 GFLOPS with 1 threads on 1 modules (ACML: 25.6 GFLOPS) +* +*********************************************************************/ + + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K 
%r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 +#define LB2_OFFSET 4096 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + + +#define A_PR1 384 +#define B_PR1 192 + +#define KERNEL8x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,8) ;\ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL8x3_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,8) ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL8x3_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,8) ;\ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + 
vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL8x3_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,8) ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ + addq $12, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x3_SUB(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddpd %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddpd %xmm15,%xmm3,%xmm0,%xmm15 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x3_1(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_2(xx) \ + vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_4(xx) \ + vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 4 * SIZE(BO, BI, 
8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + addq $12, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x3_SUB(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + + + + + +/*******************************************************************************************/ + +#define KERNEL2x3_1(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_2(xx) \ + vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_4(xx) \ + vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x3_SUB(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x3_1(xx) \ + vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_2(xx) \ + vmovsd -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_4(xx) \ + vmovsd 3 * 
SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x3_SUB(xx) \ + vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + + + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define KERNEL8x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,8) ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,8) ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,8) ;\ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,8) ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * 
SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x2_1(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_2(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_4(xx) \ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + + +/*******************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_2(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_4(xx) \ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + 
vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_2(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_4(xx) \ + vmovsd 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x2_SUB(xx) \ + vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + + + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +#define KERNEL8x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,8) ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,8) ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,8) ;\ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,8) ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + addq $4, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd 
%xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_2(xx) \ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_4(xx) \ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + addq $4, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + + +/*******************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_2(xx) \ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_4(xx) \ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_2(xx) \ + vmovsd -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_4(xx) \ + vmovsd 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x1_SUB(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + 
movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 + movq B, BO1 + leaq (B,%rax,8), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L6_02a + ALIGN_4 + +.L6_02: + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm2 + vmovups 4*SIZE(BO1), %xmm4 + vmovups 6*SIZE(BO1), %xmm6 + vmovsd (BO2), %xmm1 + vmovsd 2*SIZE(BO2), %xmm3 + vmovsd 4*SIZE(BO2), %xmm5 + vmovsd 6*SIZE(BO2), %xmm7 + vmovups %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + decq %rax + jnz .L6_02 + +.L6_02a: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L6_02c + ALIGN_4 + +.L6_02b: + + vmovups (BO1), %xmm0 + vmovsd (BO2), %xmm1 + vmovups %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax,8), BO1 // next offset to BO1 + leaq (BO1,%rax,8), BO2 // next offset to BO1 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $2, %rax // k / 4 + jz .L6_03a + ALIGN_4 + + +.L6_03: + + prefetcht0 512(BO2) + prefetchw 512(BO) + vmovups (BO2), %xmm0 + vmovups 2*SIZE(BO2), %xmm2 + vmovups 4*SIZE(BO2), %xmm4 + vmovups 6*SIZE(BO2), %xmm6 + vmovsd 1*SIZE(BO1), %xmm1 + vmovsd 3*SIZE(BO1), %xmm3 + vmovsd 5*SIZE(BO1), %xmm5 + vmovsd 7*SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + decq %rax + jnz .L6_03 + +.L6_03a: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L6_03c + ALIGN_4 + + +.L6_03b: + + vmovsd 1*SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovsd %xmm0, (BO) + vmovups %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq 
BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_17: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L6_17 + ALIGN_4 + + +.L6_19: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) + vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $7, M + jz .L7_10 // to next 3 lines of N + + testq $4, M + jz .L6_30 + + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3(xxx) + 
KERNEL4x3_4(xxx) + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L6_27 + ALIGN_4 + + +.L6_29: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_32: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L6_37 + ALIGN_4 + + +.L6_39: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_42: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + 
jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L6_47 + ALIGN_4 + + +.L6_49: + + vmovddup ALPHA, %xmm0 + + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L7_20 + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + + andq $-8, %rax + je .L7_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + + ALIGN_4 + +.L7_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_17: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L7_17 + ALIGN_4 + + +.L7_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) + vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) + + + + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L7_11 + ALIGN_4 + +.L7_20: + // Test rest of M + + testq $7, M + jz .L7_60 // to next 6 lines of N + + testq $4, M + jz .L7_30 + + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer 
to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L7_27 + ALIGN_4 + + +.L7_29: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_32: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L7_37 + ALIGN_4 + + +.L7_39: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + + +.L7_40: + testq $1, M + jz .L7_60 // to next 6 lines of N + + ALIGN_4 + +.L7_41: + leaq 
BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + + andq $-8, %rax + je .L7_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_42: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + prefetcht0 B_PR1+64(BO,BI,8) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,8) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L7_47 + ALIGN_4 + + +.L7_49: + + vmovddup ALPHA, %xmm0 + + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + + + addq $1 * SIZE, CO1 # coffset += 1 + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L2_20 + + ALIGN_4 + +.L2_11: + + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_17 + 
ALIGN_4 + + +.L2_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $7, M + jz .L2_60 // to next 2 lines of N + + testq $4, M + jz .L2_30 + + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + 
+.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA, %xmm0 + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + 
KERNEL8x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $7, M + jz .L999 + + testq $4, M + jz .L1_30 + + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + + vmovups %xmm4 , (CO1) + + addq $2 * SIZE, CO1 # 
coffset += 2 + ALIGN_4 + + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA, %xmm0 + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + + vmovsd %xmm4 , (CO1) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_0: + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq 
$2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + vmulpd %xmm0, %xmm10,%xmm10 + vmulpd %xmm0, %xmm13,%xmm13 + + vmulpd %xmm0, %xmm5,%xmm5 + vmulpd %xmm0, %xmm8,%xmm8 + vmulpd %xmm0, %xmm11,%xmm11 + vmulpd %xmm0, %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // 
Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $7, M + jz .L2_60 // to next 2 lines of N + + testq $4, M + jz .L2_30 + + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + + vmulpd %xmm0, %xmm5,%xmm5 + vmulpd %xmm0, %xmm8,%xmm8 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK 
+#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm5,%xmm5 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, 
%rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulsd %xmm0, %xmm4,%xmm4 + vmulsd %xmm0, %xmm5,%xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + 
subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + vmulpd %xmm0, %xmm10,%xmm10 + vmulpd %xmm0, %xmm13,%xmm13 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $7, M + jz .L999 + + testq $4, M + jz .L1_30 + + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + 
KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq 
%rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + prefetcht0 B_PR1(BO,BI,8) + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulsd %xmm0, %xmm4,%xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + +#endif diff --git a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S index adc00cca3..48eb1bcbe 100644 --- a/kernel/x86_64/dgemm_kernel_8x2_piledriver.S +++ b/kernel/x86_64/dgemm_kernel_8x2_piledriver.S @@ -1,4523 +1,4523 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -/********************************************************************* -* -* 2013/11/13 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* -* 2013/10/31 Saar -* -* Parameter: -* UNROLL_M 8 -* UNROLL_N 2 -* DGEMM_P 768 -* DGEMM_Q 168 -* DGEMM_R 12288 -* A_PR1 512 -* B_PR1 256 -* -* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): -* -* 4608x4608 83.9 GFLOPS with 8 threads on 4 modules (ACML: 78.4 GFLOPS) -* 4608x4608 80.9 GFLOPS with 4 threads on 4 modules (ACML: 78.4 GFLOPS) -* 4608x4608 41.3 GFLOPS with 2 threads on 2 modules (ACML: 40.9 GFLOPS) -* 4608x4608 20.7 GFLOPS with 1 threads on 1 modules (ACML: 20.8 GFLOPS) -* -* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): -* -* 13824x13824 234.5 GFLOPS with 32 threads on 16 modules (ACML: 88.5 GFLOPS) !strange thermal behavior -* 13824x13824 241.9 GFLOPS with 16 threads on 16 modules (ACML: 191.5 GFLOPS) !strange thermal behavior -* 9216x9216 137.6 GFLOPS with 8 threads on 8 modules (ACML: 106.5 GFLOPS) -* 4608x4608 75.7 GFLOPS with 4 threads on 4 modules (ACML: 56.3 GFLOPS) -* 4608x4608 38.6 GFLOPS with 2 threads on 2 modules (ACML: 34.1 GFLOPS) -* 4608x4608 19.6 GFLOPS with 1 threads on 1 modules (ACML: 18.3 GFLOPS) -* -*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 -#define LB2_OFFSET 4096 - -#define Ndiv6 24(%rsp) -#define Nmod6 
32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#if defined(BULLDOZER) - -#define VFMADD231PD_( y1,y2,y0 ) vfmaddpd y0,y1,y2,y0 - -#define VFMADD231SD_( x1,x2,x0 ) vfmaddsd x0,x1,x2,x0 - -#else - -#define VFMADD231PD_( y1,y2,y0 ) vfmadd231pd y2,y1,y0 - -#define VFMADD231SD_( x1,x2,x0 ) vfmadd231sd x2,x1,x0 - -#endif - - - - -#define A_PR1 512 -#define B_PR1 256 -#define C_PR1 64 - -.macro INIT8x3 - vxorpd %xmm4 , %xmm4 , %xmm4 - vxorpd %xmm5 , %xmm5 , %xmm5 - vxorpd %xmm6 , %xmm6 , %xmm6 - vxorpd %xmm7 , %xmm7 , %xmm7 - vxorpd %xmm8 , %xmm8 , %xmm8 - vxorpd %xmm9 , %xmm9 , %xmm9 - vxorpd %xmm10, %xmm10, %xmm10 - vxorpd %xmm11, %xmm11, %xmm11 - vxorpd %xmm12, %xmm12, %xmm12 - vxorpd %xmm13, %xmm13, %xmm13 - vxorpd %xmm14, %xmm14, %xmm14 - vxorpd %xmm15, %xmm15, %xmm15 -.endm - -.macro KERNEL8x3_INIT - vmovddup -12 * SIZE(BO), %xmm1 - vmovups -16 * SIZE(AO), %xmm0 - prefetcht0 A_PR1(AO) - vmulpd %xmm1,%xmm0,%xmm4 - vmovddup -11 * SIZE(BO), %xmm2 - vmulpd %xmm2,%xmm0,%xmm5 - vmovddup -10 * SIZE(BO), %xmm3 - vmulpd %xmm3,%xmm0,%xmm6 - vmovups -14 * SIZE(AO), %xmm0 - vmulpd %xmm1,%xmm0,%xmm7 - vmulpd %xmm2,%xmm0,%xmm8 - vmulpd %xmm3,%xmm0,%xmm9 - vmovups -12 * SIZE(AO), %xmm0 - vmulpd %xmm1,%xmm0,%xmm10 - vmulpd %xmm2,%xmm0,%xmm11 - addq $ 3 * SIZE, BO - vmulpd %xmm3,%xmm0,%xmm12 - vmovups -10 * SIZE(AO), %xmm0 - vmulpd %xmm1,%xmm0,%xmm13 - vmovddup -12 * SIZE(BO), %xmm1 - vmulpd %xmm2,%xmm0,%xmm14 - vmovddup -11 * SIZE(BO), %xmm2 - vmulpd %xmm3,%xmm0,%xmm15 -.endm - - -.macro KERNEL8x3_M1 - vmovups -16 * SIZE(AO), %xmm0 - prefetcht0 A_PR1(AO) - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups -14 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups -12 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups -10 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup -12 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup -11 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro KERNEL8x3_M2 - vmovups -8 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+64(AO) - vmovddup -10 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups -6 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups -4 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups -2 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup -9 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup -8 * 
SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - - -.macro KERNEL8x3_M3 - vmovups 0 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+128(AO) - vmovddup -7 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups 2 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups 4 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups 6 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup -6 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup -5 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro KERNEL8x3_M4 - vmovups 8 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+192(AO) - vmovddup -4 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups 10 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups 12 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups 14 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup -3 * SIZE(BO), %xmm1 - addq $ 32 * SIZE, AO - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup -2 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro KERNEL8x3_M5 - vmovups -16 * SIZE(AO), %xmm0 - prefetcht0 A_PR1(AO) - vmovddup -1 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups -14 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups -12 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups -10 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup 0 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup 1 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro KERNEL8x3_M6 - vmovups -8 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+64(AO) - vmovddup 2 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups -6 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups -4 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups -2 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup 3 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup 4 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - - -.macro KERNEL8x3_M7 - vmovups 0 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+128(AO) - vmovddup 5 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups 2 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups 4 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups 6 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup 6 * SIZE(BO), %xmm1 - VFMADD231PD_( 
%xmm2,%xmm0,%xmm14 ) - vmovddup 7 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro KERNEL8x3_M8 - vmovups 8 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+192(AO) - vmovddup 8 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups 10 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups 12 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups 14 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - vmovddup 9 * SIZE(BO), %xmm1 - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - vmovddup 10 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) - vmovddup 11 * SIZE(BO), %xmm3 - addq $ 32 * SIZE, AO - addq $ 24 * SIZE, BO -.endm - - -.macro KERNEL8x3_E - vmovups 8 * SIZE(AO), %xmm0 - prefetcht0 A_PR1+192(AO) - vmovddup 8 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups 10 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups 12 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups 14 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - addq $ 32 * SIZE, AO - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - addq $ 21 * SIZE, BO - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro KERNEL8x3_SUBN - vmovddup -12 * SIZE(BO), %xmm1 - vmovups -16 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) - vmovddup -11 * SIZE(BO), %xmm2 - VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) - vmovddup -10 * SIZE(BO), %xmm3 - VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) - vmovups -14 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) - vmovups -12 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) - VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) - VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) - vmovups -10 * SIZE(AO), %xmm0 - VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) - addq $ 3 * SIZE, BO - VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) - addq $ 8 * SIZE, AO - VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) -.endm - -.macro SAVE8x3 - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) - vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) - - prefetcht0 C_PR1(CO1) - prefetcht0 C_PR1(CO1,LDC) - prefetcht0 C_PR1(CO1,LDC,2) - - addq $ 8 * SIZE, CO1 # coffset += 8 -.endm - - 
-/*******************************************************************************************/ - -#define KERNEL4x3_1(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL4x3_2(xx) \ - vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL4x3_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL4x3_4(xx) \ - vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - addq $12, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x3_SUB(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ - - - - - -/*******************************************************************************************/ - -#define KERNEL2x3_1(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL2x3_2(xx) \ - vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL2x3_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL2x3_4(xx) \ - 
vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x3_SUB(xx) \ - vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x3_1(xx) \ - vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_2(xx) \ - vmovsd -3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -2 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd -1 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_3(xx) \ - vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd 2 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_4(xx) \ - vmovsd 3 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 4 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd 5 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x3_SUB(xx) \ - vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ - vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ - - - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -#define KERNEL8x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,8) ;\ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL8x2_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,8) ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -4 * SIZE(AO, %rax, 
8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL8x2_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,8) ;\ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL8x2_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,8) ;\ - vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - addq $8, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x2_SUB(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x2_1(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL4x2_2(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL4x2_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL4x2_4(xx) \ - vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd 
%xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x2_SUB(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ - - -/*******************************************************************************************/ - -#define KERNEL2x2_1(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL2x2_2(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL2x2_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL2x2_4(xx) \ - vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x2_SUB(xx) \ - vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_2(xx) \ - vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_3(xx) \ - vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_4(xx) \ - vmovsd 2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd 3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x2_SUB(xx) \ - vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ - vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ - - - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -#define KERNEL8x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,8) ;\ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - 
vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL8x1_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,8) ;\ - vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL8x1_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,8) ;\ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL8x1_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,8) ;\ - vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - addq $4, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x1_SUB(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x1_1(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL4x1_2(xx) \ - vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL4x1_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL4x1_4(xx) \ - vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - addq $4, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x1_SUB(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ - - -/*******************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL2x1_2(xx) \ - vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL2x1_3(xx) \ - vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL2x1_4(xx) \ - vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x1_SUB(xx) \ - vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_2(xx) \ - vmovsd -1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_3(xx) \ - vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_4(xx) \ - vmovsd 1 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x1_SUB(xx) \ - vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ - vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ - vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ - - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -.L6_01: - // copy to sub buffer - movq K, %rax - salq $1,%rax // K * 2 - movq B, BO1 - leaq (B,%rax,8), BO2 // next offset to BO2 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L6_02a - ALIGN_4 - -.L6_02: - prefetcht0 B_PR1(BO1) - prefetcht0 B_PR1(BO2) - prefetchw B_PR1(BO) - vmovups (BO1), %xmm0 - vmovups 2*SIZE(BO1), %xmm2 - vmovups 4*SIZE(BO1), %xmm4 - vmovups 6*SIZE(BO1), %xmm6 - vmovsd (BO2), %xmm1 - vmovsd 2*SIZE(BO2), %xmm3 - vmovsd 4*SIZE(BO2), %xmm5 - vmovsd 6*SIZE(BO2), %xmm7 - vmovups %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovups %xmm2, 3*SIZE(BO) - vmovsd %xmm3, 5*SIZE(BO) - vmovups %xmm4, 6*SIZE(BO) - vmovsd %xmm5, 8*SIZE(BO) - vmovups %xmm6, 9*SIZE(BO) - vmovsd %xmm7,11*SIZE(BO) - addq $ 8*SIZE,BO1 - addq $ 8*SIZE,BO2 - addq $ 
12*SIZE,BO - decq %rax - jnz .L6_02 - -.L6_02a: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L6_02c - ALIGN_4 - -.L6_02b: - - vmovups (BO1), %xmm0 - vmovsd (BO2), %xmm1 - vmovups %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO2 - addq $ 3*SIZE,BO - decq %rax - jnz .L6_02b - -.L6_02c: - - movq K, %rax - salq $1,%rax // K * 2 - leaq (B,%rax,8), BO1 // next offset to BO1 - leaq (BO1,%rax,8), BO2 // next offset to BO1 - leaq BUFFER2, BO // second buffer to BO - movq K, %rax - sarq $2, %rax // k / 4 - jz .L6_03a - ALIGN_4 - - -.L6_03: - - prefetcht0 B_PR1(BO2) - prefetchw B_PR1(BO) - vmovups (BO2), %xmm0 - vmovups 2*SIZE(BO2), %xmm2 - vmovups 4*SIZE(BO2), %xmm4 - vmovups 6*SIZE(BO2), %xmm6 - vmovsd 1*SIZE(BO1), %xmm1 - vmovsd 3*SIZE(BO1), %xmm3 - vmovsd 5*SIZE(BO1), %xmm5 - vmovsd 7*SIZE(BO1), %xmm7 - vmovsd %xmm1, 0*SIZE(BO) - vmovups %xmm0, 1*SIZE(BO) - vmovsd %xmm3, 3*SIZE(BO) - vmovups %xmm2, 4*SIZE(BO) - vmovsd %xmm5, 6*SIZE(BO) - vmovups %xmm4, 7*SIZE(BO) - vmovsd %xmm7, 9*SIZE(BO) - vmovups %xmm6,10*SIZE(BO) - addq $ 8*SIZE,BO1 - addq $ 8*SIZE,BO2 - addq $ 12*SIZE,BO - decq %rax - jnz .L6_03 - -.L6_03a: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L6_03c - ALIGN_4 - - -.L6_03b: - - vmovsd 1*SIZE(BO1), %xmm0 - vmovups (BO2), %xmm1 - vmovsd %xmm0, (BO) - vmovups %xmm1, 1*SIZE(BO) - addq $ 2*SIZE,BO1 - addq $ 2*SIZE,BO2 - addq $ 3*SIZE,BO - decq %rax - jnz .L6_03b - - -.L6_03c: - - movq BO2, B // next offset of B - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L6_20 - - ALIGN_4 - -.L6_11: - - leaq BUFFER1, BO // first buffer to BO - addq $12 * SIZE, BO - movq K, %rax - sarq $3, %rax // K / 8 - cmpq $3, %rax - jl .L6_13 - - prefetcht0 B_PR1(BO) - prefetcht0 B_PR1+64(BO) - prefetcht0 B_PR1+128(BO) - KERNEL8x3_INIT - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_M8 - - subq $2, %rax - - ALIGN_5 - -.L6_12: - - prefetcht0 B_PR1-24(BO) - prefetcht0 B_PR1+40(BO) - KERNEL8x3_M1 - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - prefetcht0 B_PR1+104(BO) - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_M8 - - dec %rax - jne .L6_12 - -.L6_12_E: - - prefetcht0 B_PR1(BO) - prefetcht0 B_PR1+64(BO) - KERNEL8x3_M1 - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_E - - jmp .L6_16 - -.L6_13: - - test $2, %rax - jz .L6_14 - - KERNEL8x3_INIT - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_M8 - - KERNEL8x3_M1 - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_E - - jmp .L6_16 - - -.L6_14: - - test $1, %rax - jz .L6_15 - - KERNEL8x3_INIT - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_E - - - jmp .L6_16 - -.L6_15: - - INIT8x3 - -.L6_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_19 - - ALIGN_4 - -.L6_17: - - KERNEL8x3_SUBN - dec %rax - jne .L6_17 - ALIGN_4 - - -.L6_19: - - SAVE8x3 - - decq I # i -- - jg .L6_11 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $7, M - jz .L7_10 // to next 3 lines of N - - testq $4, M - jz .L6_30 - - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $6 * 
SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_22: - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - jmp .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L6_27 - ALIGN_4 - - -.L6_29: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_32: - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - jmp .L6_32 - ALIGN_4 - -.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L6_37 - ALIGN_4 - - -.L6_39: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L7_10 // to next 3 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_42: - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - 
KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - jmp .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L6_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L6_47 - ALIGN_4 - - -.L6_49: - - vmovddup ALPHA, %xmm0 - - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - -/***************************************************************************************************************/ - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L7_20 - ALIGN_4 - -.L7_11: - - leaq BUFFER2, BO // first buffer to BO - addq $12 * SIZE, BO - movq K, %rax - sarq $3, %rax // K / 8 - cmpq $3, %rax - jl .L7_13 - - prefetcht0 B_PR1(BO) - prefetcht0 B_PR1+64(BO) - prefetcht0 B_PR1+128(BO) - KERNEL8x3_INIT - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_M8 - - subq $2, %rax - - ALIGN_5 - -.L7_12: - - prefetcht0 B_PR1-24(BO) - prefetcht0 B_PR1+40(BO) - KERNEL8x3_M1 - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - prefetcht0 B_PR1+104(BO) - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_M8 - - dec %rax - jne .L7_12 - -.L7_12_E: - - prefetcht0 B_PR1(BO) - prefetcht0 B_PR1+64(BO) - KERNEL8x3_M1 - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_E - - jmp .L7_16 - - - -.L7_13: - - test $2, %rax - jz .L7_14 - - KERNEL8x3_INIT - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_M8 - - KERNEL8x3_M1 - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_E - - jmp .L7_16 - - -.L7_14: - - test $1, %rax - jz .L7_15 - - KERNEL8x3_INIT - KERNEL8x3_M2 - KERNEL8x3_M3 - KERNEL8x3_M4 - KERNEL8x3_M5 - KERNEL8x3_M6 - KERNEL8x3_M7 - KERNEL8x3_E - - jmp .L7_16 - - - -.L7_15: - - INIT8x3 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - - ALIGN_4 - -.L7_17: - - KERNEL8x3_SUBN - dec %rax - jne .L7_17 - ALIGN_4 - - -.L7_19: - - SAVE8x3 - - decq I # i -- - jg .L7_11 - ALIGN_4 - -.L7_20: - // Test rest of M - - testq $7, M - jz .L7_60 // to next 6 lines of N - - testq $4, M - jz .L7_30 - - ALIGN_4 - -.L7_21: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_22: - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - 
KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - jmp .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L7_27 - ALIGN_4 - - -.L7_29: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) - - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_32: - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - jmp .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L7_37 - ALIGN_4 - - -.L7_39: - - vmovddup ALPHA, %xmm0 - - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - - - - -.L7_40: - testq $1, M - jz .L7_60 // to next 6 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - - andq $-8, %rax - je .L7_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_42: - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - jmp .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L7_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl 
.L7_47 - ALIGN_4 - - -.L7_49: - - vmovddup ALPHA, %xmm0 - - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - vmovsd %xmm6 , (CO1, LDC, 2) - - - addq $1 * SIZE, CO1 # coffset += 1 - -.L7_60: - - decq J // j -- - jg .L6_01 - - -.L2_0: - cmpq $0, Nmod6 // N % 6 == 0 - je .L999 - -/************************************************************************************************ -* Loop for Nmod6 / 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - sarq $1, J // j = j / 2 - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L2_20 - - ALIGN_4 - -.L2_11: - - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $7, M - jz .L2_60 // to next 2 lines of N - - testq $4, M - jz .L2_30 - - ALIGN_4 - -.L2_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of 
values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 
2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA, %xmm0 - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L2_60: - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L1_20 - - ALIGN_4 - -.L1_11: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $7, M - jz .L999 - - testq $4, M - jz .L1_30 - - ALIGN_4 - -.L1_21: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - 
KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovddup ALPHA, %xmm0 - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - - vmovups %xmm4 , (CO1) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA, %xmm0 - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - - vmovsd %xmm4 , (CO1) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else 
-/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 2 - movq %rax, Ndiv6 // N / 2 - movq %rdx, Nmod6 // N % 2 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - -.L2_0: - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L2_20 - - ALIGN_4 - -.L2_11: - -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - 
KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - vmulpd %xmm0, %xmm10,%xmm10 - vmulpd %xmm0, %xmm13,%xmm13 - - vmulpd %xmm0, %xmm5,%xmm5 - vmulpd %xmm0, %xmm8,%xmm8 - vmulpd %xmm0, %xmm11,%xmm11 - vmulpd %xmm0, %xmm14,%xmm14 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - vmovups %xmm11, 4 * SIZE(CO1, LDC) - vmovups %xmm14, 6 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $7, M - jz .L2_60 // to next 2 lines of N - - testq $4, M - jz .L2_30 - - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - 
KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - - vmulpd %xmm0, %xmm5,%xmm5 - vmulpd %xmm0, %xmm8,%xmm8 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 2 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - 
- KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm5,%xmm5 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulsd %xmm0, %xmm4,%xmm4 - vmulsd %xmm0, %xmm5,%xmm5 - -#endif - - vmovsd %xmm4 , (CO1) - vmovsd %xmm5 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - 
-/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $3, I // i = (m >> 3) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - vmulpd %xmm0, %xmm10,%xmm10 - vmulpd %xmm0, %xmm13,%xmm13 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - vmovups %xmm10, 4 * SIZE(CO1) - vmovups %xmm13, 6 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - addq $8 * SIZE, CO1 # coffset += 8 - decq I # i -- - jg .L1_11 - ALIGN_4 - 
-/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $7, M - jz .L999 - - testq $4, M - jz .L1_30 - - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - vmulpd %xmm0, %xmm7,%xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 2 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK 
-#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulpd %xmm0, %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, 8), AO - leaq (BO, BI, 8), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovddup ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulsd %xmm0, %xmm4,%xmm4 - -#endif - - vmovsd %xmm4 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO 
- leaq (BO, BI, 8), BO - leaq (AO, %rax, 8), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - -#endif +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +/********************************************************************* +* +* 2013/11/13 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/31 Saar +* +* Parameter: +* UNROLL_M 8 +* UNROLL_N 2 +* DGEMM_P 768 +* DGEMM_Q 168 +* DGEMM_R 12288 +* A_PR1 512 +* B_PR1 256 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 4608x4608 83.9 GFLOPS with 8 threads on 4 modules (ACML: 78.4 GFLOPS) +* 4608x4608 80.9 GFLOPS with 4 threads on 4 modules (ACML: 78.4 GFLOPS) +* 4608x4608 41.3 GFLOPS with 2 threads on 2 modules (ACML: 40.9 GFLOPS) +* 4608x4608 20.7 GFLOPS with 1 threads on 1 modules (ACML: 20.8 GFLOPS) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 13824x13824 234.5 GFLOPS with 32 threads on 16 modules (ACML: 88.5 GFLOPS) !strange thermal behavior +* 13824x13824 241.9 GFLOPS with 16 threads on 16 modules (ACML: 191.5 GFLOPS) !strange thermal behavior +* 9216x9216 137.6 GFLOPS with 8 threads on 8 modules (ACML: 106.5 GFLOPS) +* 4608x4608 75.7 GFLOPS with 4 threads on 4 modules (ACML: 56.3 GFLOPS) +* 4608x4608 38.6 GFLOPS with 2 threads on 2 modules (ACML: 34.1 GFLOPS) +* 4608x4608 19.6 GFLOPS with 1 threads on 1 modules (ACML: 18.3 GFLOPS) +* +*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 +#define LB2_OFFSET 4096 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#if defined(BULLDOZER) + +#define VFMADD231PD_( y1,y2,y0 ) vfmaddpd y0,y1,y2,y0 + +#define VFMADD231SD_( x1,x2,x0 ) vfmaddsd x0,x1,x2,x0 + +#else + +#define VFMADD231PD_( y1,y2,y0 ) vfmadd231pd y2,y1,y0 + +#define VFMADD231SD_( x1,x2,x0 ) vfmadd231sd x2,x1,x0 + +#endif + + + + +#define A_PR1 512 +#define B_PR1 256 +#define C_PR1 64 + +.macro INIT8x3 + vxorpd %xmm4 , %xmm4 , %xmm4 + vxorpd %xmm5 , %xmm5 , %xmm5 + vxorpd %xmm6 , %xmm6 , %xmm6 + vxorpd %xmm7 , %xmm7 , %xmm7 + vxorpd %xmm8 , %xmm8 , %xmm8 + vxorpd %xmm9 , %xmm9 , %xmm9 + vxorpd %xmm10, %xmm10, %xmm10 + vxorpd %xmm11, %xmm11, %xmm11 + vxorpd %xmm12, %xmm12, %xmm12 + vxorpd %xmm13, %xmm13, %xmm13 + vxorpd %xmm14, 
%xmm14, %xmm14 + vxorpd %xmm15, %xmm15, %xmm15 +.endm + +.macro KERNEL8x3_INIT + vmovddup -12 * SIZE(BO), %xmm1 + vmovups -16 * SIZE(AO), %xmm0 + prefetcht0 A_PR1(AO) + vmulpd %xmm1,%xmm0,%xmm4 + vmovddup -11 * SIZE(BO), %xmm2 + vmulpd %xmm2,%xmm0,%xmm5 + vmovddup -10 * SIZE(BO), %xmm3 + vmulpd %xmm3,%xmm0,%xmm6 + vmovups -14 * SIZE(AO), %xmm0 + vmulpd %xmm1,%xmm0,%xmm7 + vmulpd %xmm2,%xmm0,%xmm8 + vmulpd %xmm3,%xmm0,%xmm9 + vmovups -12 * SIZE(AO), %xmm0 + vmulpd %xmm1,%xmm0,%xmm10 + vmulpd %xmm2,%xmm0,%xmm11 + addq $ 3 * SIZE, BO + vmulpd %xmm3,%xmm0,%xmm12 + vmovups -10 * SIZE(AO), %xmm0 + vmulpd %xmm1,%xmm0,%xmm13 + vmovddup -12 * SIZE(BO), %xmm1 + vmulpd %xmm2,%xmm0,%xmm14 + vmovddup -11 * SIZE(BO), %xmm2 + vmulpd %xmm3,%xmm0,%xmm15 +.endm + + +.macro KERNEL8x3_M1 + vmovups -16 * SIZE(AO), %xmm0 + prefetcht0 A_PR1(AO) + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups -14 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups -12 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups -10 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup -12 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup -11 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro KERNEL8x3_M2 + vmovups -8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+64(AO) + vmovddup -10 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups -6 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups -4 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups -2 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup -9 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup -8 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + + +.macro KERNEL8x3_M3 + vmovups 0 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+128(AO) + vmovddup -7 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups 2 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups 4 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups 6 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup -6 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup -5 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro KERNEL8x3_M4 + vmovups 8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+192(AO) + vmovddup -4 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups 10 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups 12 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups 14 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup -3 * SIZE(BO), %xmm1 + addq $ 32 * SIZE, AO + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup -2 * 
SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro KERNEL8x3_M5 + vmovups -16 * SIZE(AO), %xmm0 + prefetcht0 A_PR1(AO) + vmovddup -1 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups -14 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups -12 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups -10 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup 0 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup 1 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro KERNEL8x3_M6 + vmovups -8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+64(AO) + vmovddup 2 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups -6 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups -4 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups -2 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup 3 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup 4 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + + +.macro KERNEL8x3_M7 + vmovups 0 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+128(AO) + vmovddup 5 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups 2 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups 4 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups 6 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup 6 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup 7 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro KERNEL8x3_M8 + vmovups 8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+192(AO) + vmovddup 8 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups 10 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups 12 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups 14 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + vmovddup 9 * SIZE(BO), %xmm1 + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + vmovddup 10 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) + vmovddup 11 * SIZE(BO), %xmm3 + addq $ 32 * SIZE, AO + addq $ 24 * SIZE, BO +.endm + + +.macro KERNEL8x3_E + vmovups 8 * SIZE(AO), %xmm0 + prefetcht0 A_PR1+192(AO) + vmovddup 8 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups 10 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups 12 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups 14 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + 
addq $ 32 * SIZE, AO + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + addq $ 21 * SIZE, BO + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro KERNEL8x3_SUBN + vmovddup -12 * SIZE(BO), %xmm1 + vmovups -16 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm4 ) + vmovddup -11 * SIZE(BO), %xmm2 + VFMADD231PD_( %xmm2,%xmm0,%xmm5 ) + vmovddup -10 * SIZE(BO), %xmm3 + VFMADD231PD_( %xmm3,%xmm0,%xmm6 ) + vmovups -14 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm7 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm8 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm9 ) + vmovups -12 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm10 ) + VFMADD231PD_( %xmm2,%xmm0,%xmm11 ) + VFMADD231PD_( %xmm3,%xmm0,%xmm12 ) + vmovups -10 * SIZE(AO), %xmm0 + VFMADD231PD_( %xmm1,%xmm0,%xmm13 ) + addq $ 3 * SIZE, BO + VFMADD231PD_( %xmm2,%xmm0,%xmm14 ) + addq $ 8 * SIZE, AO + VFMADD231PD_( %xmm3,%xmm0,%xmm15 ) +.endm + +.macro SAVE8x3 + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 4 * SIZE(CO1, LDC, 2) + vmovups %xmm15, 6 * SIZE(CO1, LDC, 2) + + prefetcht0 C_PR1(CO1) + prefetcht0 C_PR1(CO1,LDC) + prefetcht0 C_PR1(CO1,LDC,2) + + addq $ 8 * SIZE, CO1 # coffset += 8 +.endm + + +/*******************************************************************************************/ + +#define KERNEL4x3_1(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_2(xx) \ + vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL4x3_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define 
KERNEL4x3_4(xx) \ + vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + addq $12, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x3_SUB(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + + + + + +/*******************************************************************************************/ + +#define KERNEL2x3_1(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_2(xx) \ + vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL2x3_4(xx) \ + vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x3_SUB(xx) \ + vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x3_1(xx) \ + vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_2(xx) \ + vmovsd -3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -2 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -1 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 1 * SIZE(BO, BI, 
8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd 2 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_4(xx) \ + vmovsd 3 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 4 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd 5 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x3_SUB(xx) \ + vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\ + vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\ + + + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define KERNEL8x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,8) ;\ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,8) ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,8) ;\ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL8x2_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,8) ;\ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + 
+#define KERNEL8x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x2_1(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_2(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL4x2_4(xx) \ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\ + + +/*******************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_2(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL2x2_4(xx) \ + vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $8, 
%rax ;\ + +#define KERNEL2x2_SUB(xx) \ + vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_2(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_4(xx) \ + vmovsd 2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd 3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x2_SUB(xx) \ + vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\ + vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\ + + + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +#define KERNEL8x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,8) ;\ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,8) ;\ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,8) ;\ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL8x1_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,8) ;\ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + addq $4, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, 
BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_2(xx) \ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL4x1_4(xx) \ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + addq $4, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\ + + +/*******************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_2(xx) \ + vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_3(xx) \ + vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL2x1_4(xx) \ + vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x1_SUB(xx) \ + vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_2(xx) \ + vmovsd -1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_3(xx) \ + vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_4(xx) \ + vmovsd 1 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x1_SUB(xx) \ + vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\ + vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\ + vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\ + + +/*******************************************************************************************/ + +#if 
!defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 + movq B, BO1 + leaq (B,%rax,8), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L6_02a + ALIGN_4 + +.L6_02: + prefetcht0 B_PR1(BO1) + prefetcht0 B_PR1(BO2) + prefetchw B_PR1(BO) + vmovups (BO1), %xmm0 + vmovups 2*SIZE(BO1), %xmm2 + vmovups 4*SIZE(BO1), %xmm4 + vmovups 6*SIZE(BO1), %xmm6 + vmovsd (BO2), %xmm1 + vmovsd 2*SIZE(BO2), %xmm3 + vmovsd 4*SIZE(BO2), %xmm5 + vmovsd 6*SIZE(BO2), %xmm7 + vmovups %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovups %xmm2, 3*SIZE(BO) + vmovsd %xmm3, 5*SIZE(BO) + vmovups %xmm4, 6*SIZE(BO) + vmovsd %xmm5, 8*SIZE(BO) + vmovups %xmm6, 9*SIZE(BO) + vmovsd %xmm7,11*SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + decq %rax + jnz .L6_02 + +.L6_02a: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L6_02c + ALIGN_4 + +.L6_02b: + + vmovups (BO1), %xmm0 + vmovsd (BO2), %xmm1 + vmovups %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO2 + addq $ 3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax,8), BO1 // next offset to BO1 + leaq (BO1,%rax,8), BO2 // next offset to BO1 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $2, %rax // k / 4 + jz .L6_03a + ALIGN_4 + + +.L6_03: + + prefetcht0 B_PR1(BO2) + prefetchw B_PR1(BO) + vmovups (BO2), %xmm0 + vmovups 2*SIZE(BO2), %xmm2 + vmovups 4*SIZE(BO2), %xmm4 + vmovups 6*SIZE(BO2), %xmm6 + vmovsd 1*SIZE(BO1), %xmm1 + vmovsd 3*SIZE(BO1), %xmm3 + vmovsd 5*SIZE(BO1), %xmm5 + vmovsd 7*SIZE(BO1), %xmm7 + vmovsd %xmm1, 0*SIZE(BO) + vmovups %xmm0, 1*SIZE(BO) + vmovsd %xmm3, 3*SIZE(BO) + vmovups %xmm2, 4*SIZE(BO) + vmovsd %xmm5, 6*SIZE(BO) + vmovups %xmm4, 7*SIZE(BO) + vmovsd %xmm7, 9*SIZE(BO) + vmovups %xmm6,10*SIZE(BO) + addq $ 8*SIZE,BO1 + addq $ 8*SIZE,BO2 + addq $ 12*SIZE,BO + decq %rax + jnz .L6_03 + +.L6_03a: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L6_03c + ALIGN_4 + + +.L6_03b: + + vmovsd 1*SIZE(BO1), %xmm0 + vmovups (BO2), %xmm1 + vmovsd %xmm0, (BO) + vmovups %xmm1, 1*SIZE(BO) + addq $ 2*SIZE,BO1 + addq $ 2*SIZE,BO2 + addq $ 3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C 
+ leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L6_20 + + ALIGN_4 + +.L6_11: + + leaq BUFFER1, BO // first buffer to BO + addq $12 * SIZE, BO + movq K, %rax + sarq $3, %rax // K / 8 + cmpq $3, %rax + jl .L6_13 + + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + subq $2, %rax + + ALIGN_5 + +.L6_12: + + prefetcht0 B_PR1-24(BO) + prefetcht0 B_PR1+40(BO) + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + prefetcht0 B_PR1+104(BO) + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + dec %rax + jne .L6_12 + +.L6_12_E: + + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L6_16 + +.L6_13: + + test $2, %rax + jz .L6_14 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L6_16 + + +.L6_14: + + test $1, %rax + jz .L6_15 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + + jmp .L6_16 + +.L6_15: + + INIT8x3 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + ALIGN_4 + +.L6_17: + + KERNEL8x3_SUBN + dec %rax + jne .L6_17 + ALIGN_4 + + +.L6_19: + + SAVE8x3 + + decq I # i -- + jg .L6_11 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $7, M + jz .L7_10 // to next 3 lines of N + + testq $4, M + jz .L6_30 + + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_22: + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L6_27 + ALIGN_4 + + +.L6_29: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + 
vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_32: + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L6_37 + ALIGN_4 + + +.L6_39: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L6_47 + ALIGN_4 + + +.L6_49: + + vmovddup ALPHA, %xmm0 + + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L7_20 + ALIGN_4 + +.L7_11: + + leaq BUFFER2, BO // first buffer to BO + addq $12 * SIZE, BO + movq K, %rax + sarq $3, %rax // K / 8 + cmpq $3, %rax + jl .L7_13 + + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + prefetcht0 B_PR1+128(BO) + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + 
KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + subq $2, %rax + + ALIGN_5 + +.L7_12: + + prefetcht0 B_PR1-24(BO) + prefetcht0 B_PR1+40(BO) + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + prefetcht0 B_PR1+104(BO) + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + dec %rax + jne .L7_12 + +.L7_12_E: + + prefetcht0 B_PR1(BO) + prefetcht0 B_PR1+64(BO) + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L7_16 + + + +.L7_13: + + test $2, %rax + jz .L7_14 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_M8 + + KERNEL8x3_M1 + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L7_16 + + +.L7_14: + + test $1, %rax + jz .L7_15 + + KERNEL8x3_INIT + KERNEL8x3_M2 + KERNEL8x3_M3 + KERNEL8x3_M4 + KERNEL8x3_M5 + KERNEL8x3_M6 + KERNEL8x3_M7 + KERNEL8x3_E + + jmp .L7_16 + + + +.L7_15: + + INIT8x3 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + + ALIGN_4 + +.L7_17: + + KERNEL8x3_SUBN + dec %rax + jne .L7_17 + ALIGN_4 + + +.L7_19: + + SAVE8x3 + + decq I # i -- + jg .L7_11 + ALIGN_4 + +.L7_20: + // Test rest of M + + testq $7, M + jz .L7_60 // to next 6 lines of N + + testq $4, M + jz .L7_30 + + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_22: + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L7_27 + ALIGN_4 + + +.L7_29: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_32: + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + 
KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L7_37 + ALIGN_4 + + +.L7_39: + + vmovddup ALPHA, %xmm0 + + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + + + + +.L7_40: + testq $1, M + jz .L7_60 // to next 6 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + + andq $-8, %rax + je .L7_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L7_47 + ALIGN_4 + + +.L7_49: + + vmovddup ALPHA, %xmm0 + + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + vmovsd %xmm6 , (CO1, LDC, 2) + + + addq $1 * SIZE, CO1 # coffset += 1 + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L2_20 + + ALIGN_4 + +.L2_11: + + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq 
BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $7, M + jz .L2_60 // to next 2 lines of N + + testq $4, M + jz .L2_30 + + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI 
// BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA, %xmm0 + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq 
(BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $7, M + jz .L999 + + testq $4, M + jz .L1_30 + + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, 
%rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovddup ALPHA, %xmm0 + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + + vmovups %xmm4 , (CO1) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA, %xmm0 + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + + vmovsd %xmm4 , (CO1) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 2 + movq %rax, Ndiv6 // N / 2 + movq %rdx, Nmod6 // N % 2 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq 
$0, J + je .L1_0 + ALIGN_4 + +.L2_0: + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L2_20 + + ALIGN_4 + +.L2_11: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + vmulpd %xmm0, %xmm10,%xmm10 + vmulpd %xmm0, %xmm13,%xmm13 + + vmulpd %xmm0, %xmm5,%xmm5 + vmulpd %xmm0, %xmm8,%xmm8 + vmulpd %xmm0, %xmm11,%xmm11 + vmulpd %xmm0, %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + vmovups %xmm11, 4 * SIZE(CO1, LDC) + vmovups %xmm14, 6 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $7, M + jz .L2_60 // to next 2 lines of N + + testq $4, M + jz .L2_30 + + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + + vmulpd %xmm0, %xmm5,%xmm5 + vmulpd %xmm0, %xmm8,%xmm8 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 2 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK 
+#endif
+
+	addq $4 * SIZE, CO1 # coffset += 4
+	ALIGN_4
+
+
+.L2_30:
+	testq $2, M
+	jz .L2_40
+
+	ALIGN_4
+
+.L2_31:
+#if !defined(TRMMKERNEL) || \
+    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	leaq BUFFER1, BO // first buffer to BO
+	addq $4 * SIZE, BO
+#else
+	movq KK, %rax
+	leaq BUFFER1, BO // first buffer to BO
+	addq $4 * SIZE, BO
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+	leaq (BO, BI, 8), BO
+	salq $1, %rax // rax = rax * 2 ; number of values
+	leaq (AO, %rax, 8), AO
+#endif
+
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+	movq K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	movq K, %rax
+	subq KK, %rax
+	movq %rax, KKK
+#else
+	movq KK, %rax
+#ifdef LEFT
+	addq $2, %rax // number of values in AO
+#else
+	addq $2, %rax // number of values in BO
+#endif
+	movq %rax, KKK
+#endif
+
+
+	andq $-8, %rax
+	je .L2_36
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+
+	salq $1, %rax // rax = rax *2 ; number of values
+	leaq (AO, %rax, 8), AO
+	leaq (BO, BI, 8), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L2_32:
+
+	KERNEL2x2_1(xxx)
+	KERNEL2x2_2(xxx)
+	KERNEL2x2_3(xxx)
+	KERNEL2x2_4(xxx)
+
+	KERNEL2x2_1(xxx)
+	KERNEL2x2_2(xxx)
+	KERNEL2x2_3(xxx)
+	KERNEL2x2_4(xxx)
+
+	je .L2_36
+
+	KERNEL2x2_1(xxx)
+	KERNEL2x2_2(xxx)
+	KERNEL2x2_3(xxx)
+	KERNEL2x2_4(xxx)
+
+	KERNEL2x2_1(xxx)
+	KERNEL2x2_2(xxx)
+	KERNEL2x2_3(xxx)
+	KERNEL2x2_4(xxx)
+
+	je .L2_36
+
+	jmp .L2_32
+	ALIGN_4
+
+.L2_36:
+#ifndef TRMMKERNEL
+	movq K, %rax
+#else
+	movq KKK, %rax
+#endif
+
+	andq $7, %rax # if (k & 1)
+	je .L2_39
+
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+
+	salq $1, %rax // rax = rax *2 ; number of values
+	leaq (AO, %rax, 8), AO
+	leaq (BO, BI, 8), BO
+	negq BI
+	negq %rax
+	ALIGN_4
+
+.L2_37:
+
+	KERNEL2x2_SUB(xxx)
+	addq $2, BI
+	addq $2, %rax
+	jl .L2_37
+	ALIGN_4
+
+
+.L2_39:
+
+	vmovddup ALPHA, %xmm0
+
+#ifndef TRMMKERNEL
+
+	vfmaddpd (CO1),%xmm0, %xmm4,%xmm4
+	vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5
+
+#else
+	vmulpd %xmm0, %xmm4,%xmm4
+	vmulpd %xmm0, %xmm5,%xmm5
+
+#endif
+
+	vmovups %xmm4 , (CO1)
+	vmovups %xmm5 , (CO1, LDC)
+
+#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	movq K, %rax
+	subq KKK, %rax
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+	leaq (BO, BI, 8), BO
+	salq $1, %rax // rax = rax * 2 ; number of values
+	leaq (AO, %rax, 8), AO
+#endif
+
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	addq $2, KK
+#endif
+
+	addq $2 * SIZE, CO1 # coffset += 2
+	ALIGN_4
+
+
+.L2_40:
+	testq $1, M
+	jz .L2_60 // to next 2 lines of N
+
+	ALIGN_4
+
+.L2_41:
+#if !defined(TRMMKERNEL) || \
+    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+	leaq BUFFER1, BO // first buffer to BO
+	addq $4 * SIZE, BO
+#else
+	movq KK, %rax
+	leaq BUFFER1, BO // first buffer to BO
+	addq $4 * SIZE, BO
+	movq %rax, BI // Index for BO
+	leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
+	leaq (BO, BI, 8), BO
+	leaq (AO, %rax, 8), AO
+#endif
+
+	vzeroall
+
+#ifndef TRMMKERNEL
+	movq K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+	movq K, %rax
+	subq KK, %rax
+	movq %rax, KKK
+#else
+	movq KK, %rax
+#ifdef LEFT
+	addq $1, %rax // number of values in
AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulsd %xmm0, %xmm4,%xmm4 + vmulsd %xmm0, %xmm5,%xmm5 + +#endif + + vmovsd %xmm4 , (CO1) + vmovsd %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $3, I // i = (m >> 3) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 
8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + vmulpd %xmm0, %xmm10,%xmm10 + vmulpd %xmm0, %xmm13,%xmm13 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + vmovups %xmm10, 4 * SIZE(CO1) + vmovups %xmm13, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + addq $8 * SIZE, CO1 # coffset += 8 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $7, M + jz .L999 + + testq $4, M + jz .L1_30 + + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, 
%rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + vmulpd %xmm0, %xmm7,%xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulpd %xmm0, %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, 8), AO + leaq (BO, BI, 8), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovddup ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddsd (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulsd %xmm0, %xmm4,%xmm4 + +#endif + + vmovsd %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, 8), BO + leaq (AO, %rax, 8), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + +#endif diff --git a/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S b/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S index 9cc27184d..b31a934f2 100644 --- a/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S +++ b/kernel/x86_64/sgemm_kernel_16x2_bulldozer.S @@ -1,5231 +1,5231 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. 
Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 -#define LB2_OFFSET 4096 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - - -#define A_PR1 384 -#define B_PR1 192 - -/******************************************************************************************* -* 3 lines of N -*******************************************************************************************/ - -#define KERNEL16x3_1(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * 
SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_2(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_3(xx) \ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_4(xx) \ - vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - addq $12, BI ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - addq $64, %rax ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_SUB(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, 
%rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - - -/*******************************************************************************************/ - -#define KERNEL8x3_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL8x3_2(xx) \ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL8x3_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL8x3_4(xx) \ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - addq $12, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x3_SUB(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x3_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 
;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL4x3_2(xx) \ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL4x3_3(xx) \ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL4x3_4(xx) \ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x3_SUB(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -/*******************************************************************************************/ - -#define KERNEL2x3_1(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -#define KERNEL2x3_2(xx) \ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -#define KERNEL2x3_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -#define KERNEL2x3_4(xx) \ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - addq $12, BI ;\ - addq 
$8, %rax ;\ - -#define KERNEL2x3_SUB(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x3_1(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_2(xx) \ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_4(xx) \ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x3_SUB(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -/*******************************************************************************************/ - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -#define KERNEL16x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL16x2_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 
;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL16x2_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL16x2_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - addq $8, BI ;\ - addq $64, %rax ;\ - -#define KERNEL16x2_SUB(xx) \ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - - -/*******************************************************************************************/ - -#define KERNEL8x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL8x2_2(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL8x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps 
%xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL8x2_4(xx) \ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - addq $8, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x2_SUB(xx) \ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL4x2_2(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL4x2_3(xx) \ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL4x2_4(xx) \ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x2_SUB(xx) \ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -/*******************************************************************************************/ - -#define KERNEL2x2_1(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -#define KERNEL2x2_2(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -#define KERNEL2x2_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -#define KERNEL2x2_4(xx) \ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x2_SUB(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_2(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_4(xx) \ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x2_SUB(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -/*******************************************************************************************/ - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -#define KERNEL16x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL16x1_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL16x1_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 12 * 
SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL16x1_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - addq $4, BI ;\ - addq $64, %rax ;\ - -#define KERNEL16x1_SUB(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - - -/*******************************************************************************************/ - -#define KERNEL8x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL8x1_2(xx) \ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL8x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL8x1_4(xx) \ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - addq $4, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x1_SUB(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL4x1_2(xx) \ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL4x1_3(xx) \ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL4x1_4(xx) \ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x1_SUB(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -/*******************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - 
vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -#define KERNEL2x1_2(xx) \ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -#define KERNEL2x1_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -#define KERNEL2x1_4(xx) \ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - addq $4, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x1_SUB(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_2(xx) \ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_4(xx) \ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x1_SUB(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -.L6_01: - // copy to sub buffer - movq K, %rax - salq $1,%rax // K * 2 ; read 2 values - 
movq B, BO1 - leaq (B,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_01a_2 - ALIGN_4 - -.L6_01a_1: - - prefetcht0 512(BO1) - prefetcht0 512(BO2) - prefetchw 512(BO) - - vmovsd 0 * SIZE(BO1), %xmm0 - vmovsd 2 * SIZE(BO1), %xmm2 - vmovsd 4 * SIZE(BO1), %xmm4 - vmovsd 6 * SIZE(BO1), %xmm6 - vmovss 0 * SIZE(BO2), %xmm1 - vmovss 2 * SIZE(BO2), %xmm3 - vmovss 4 * SIZE(BO2), %xmm5 - vmovss 6 * SIZE(BO2), %xmm7 - vmovsd %xmm0, 0*SIZE(BO) - vmovss %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 3*SIZE(BO) - vmovss %xmm3, 5*SIZE(BO) - vmovsd %xmm4, 6*SIZE(BO) - vmovss %xmm5, 8*SIZE(BO) - vmovsd %xmm6, 9*SIZE(BO) - vmovss %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - vmovsd 0 * SIZE(BO1), %xmm0 - vmovsd 2 * SIZE(BO1), %xmm2 - vmovsd 4 * SIZE(BO1), %xmm4 - vmovsd 6 * SIZE(BO1), %xmm6 - vmovss 0 * SIZE(BO2), %xmm1 - vmovss 2 * SIZE(BO2), %xmm3 - vmovss 4 * SIZE(BO2), %xmm5 - vmovss 6 * SIZE(BO2), %xmm7 - vmovsd %xmm0, 0*SIZE(BO) - vmovss %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 3*SIZE(BO) - vmovss %xmm3, 5*SIZE(BO) - vmovsd %xmm4, 6*SIZE(BO) - vmovss %xmm5, 8*SIZE(BO) - vmovsd %xmm6, 9*SIZE(BO) - vmovss %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - decq %rax - jnz .L6_01a_1 - - - -.L6_01a_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_02c - ALIGN_4 - - -.L6_02b: - - vmovsd 0 * SIZE(BO1), %xmm0 - vmovss 0 * SIZE(BO2), %xmm2 - vmovsd %xmm0, 0*SIZE(BO) - vmovss %xmm2, 2*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_02b - -.L6_02c: - - movq K, %rax - salq $1,%rax // K * 2 - leaq (B,%rax, SIZE), BO1 // next offset to BO1 - leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER2, BO // second buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_02c_2 - ALIGN_4 - -.L6_02c_1: - - prefetcht0 512(BO2) - prefetchw 512(BO) - - vmovsd 0 * SIZE(BO2), %xmm0 - vmovsd 2 * SIZE(BO2), %xmm2 - vmovsd 4 * SIZE(BO2), %xmm4 - vmovsd 6 * SIZE(BO2), %xmm6 - vmovss 1 * SIZE(BO1), %xmm1 - vmovss 3 * SIZE(BO1), %xmm3 - vmovss 5 * SIZE(BO1), %xmm5 - vmovss 7 * SIZE(BO1), %xmm7 - vmovss %xmm1, 0*SIZE(BO) - vmovsd %xmm0, 1*SIZE(BO) - vmovss %xmm3, 3*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovss %xmm5, 6*SIZE(BO) - vmovsd %xmm4, 7*SIZE(BO) - vmovss %xmm7, 9*SIZE(BO) - vmovsd %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - - vmovsd 0 * SIZE(BO2), %xmm0 - vmovsd 2 * SIZE(BO2), %xmm2 - vmovsd 4 * SIZE(BO2), %xmm4 - vmovsd 6 * SIZE(BO2), %xmm6 - vmovss 1 * SIZE(BO1), %xmm1 - vmovss 3 * SIZE(BO1), %xmm3 - vmovss 5 * SIZE(BO1), %xmm5 - vmovss 7 * SIZE(BO1), %xmm7 - vmovss %xmm1, 0*SIZE(BO) - vmovsd %xmm0, 1*SIZE(BO) - vmovss %xmm3, 3*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovss %xmm5, 6*SIZE(BO) - vmovsd %xmm4, 7*SIZE(BO) - vmovss %xmm7, 9*SIZE(BO) - vmovsd %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - decq %rax - jnz .L6_02c_1 - - -.L6_02c_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_03c - ALIGN_4 - -.L6_03b: - - vmovss 1*SIZE(BO1), %xmm0 - vmovsd 0*SIZE(BO2), %xmm1 - vmovss %xmm0, 0*SIZE(BO) - vmovsd %xmm1, 1*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_03b - - -.L6_03c: - - movq BO2, B // next offset of B - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L6_20 - - ALIGN_4 - -.L6_11: - leaq BUFFER1, 
BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L6_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L6_16 - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L6_16 - - jmp .L6_12 - ALIGN_4 - -.L6_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_17: - - KERNEL16x3_SUB(xxx) - addq $3, BI - addq $16, %rax - jl .L6_17 - ALIGN_4 - - -.L6_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) - vmovups %xmm15,12 * SIZE(CO1, LDC, 2) - - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L6_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $15, M - jz .L7_10 // to next 3 lines of N - - testq $8, M - jz .L6_21pre - ALIGN_4 - -/**************************************************************************/ - -.L6_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L6_20_6 - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L6_20_6 
- - jmp .L6_20_2 - ALIGN_4 - -.L6_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_20_7: - - KERNEL8x3_SUB(xxx) - addq $3, BI - addq $8, %rax - jl .L6_20_7 - ALIGN_4 - - -.L6_20_9: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L6_21pre: - - testq $4, M - jz .L6_30 - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - jmp .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L6_27 - ALIGN_4 - - -.L6_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI,SIZE) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,SIZE) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - jmp .L6_32 - ALIGN_4 - -.L6_36: - 
movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L6_37 - ALIGN_4 - - -.L6_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L7_10 // to next 3 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_42: - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - jmp .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L6_47 - ALIGN_4 - - -.L6_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - -/***************************************************************************************************************/ - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L7_20 - - ALIGN_4 - -.L7_11: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L7_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L7_16 - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L7_16 - - jmp .L7_12 - ALIGN_4 - -.L7_16: - movq K, %rax - - andq $7, 
%rax # if (k & 1) - je .L7_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_17: - - KERNEL16x3_SUB(xxx) - addq $3, BI - addq $16, %rax - jl .L7_17 - ALIGN_4 - - -.L7_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) - vmovups %xmm15,12 * SIZE(CO1, LDC, 2) - - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L7_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L7_20: - // Test rest of M - - testq $15, M - jz .L7_60 // to next 3 lines of N - - testq $8, M - jz .L7_21pre - ALIGN_4 - -/**************************************************************************/ - -.L7_20_1: - leaq BUFFER2, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L7_20_6 - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L7_20_6 - - jmp .L7_20_2 - ALIGN_4 - -.L7_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_20_7: - - KERNEL8x3_SUB(xxx) - addq $3, BI - addq $8, %rax - jl .L7_20_7 - ALIGN_4 - -.L7_20_9: - - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, 
LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L7_21pre: - - testq $4, M - jz .L7_30 - ALIGN_4 - -.L7_21: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - jmp .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L7_27 - ALIGN_4 - - -.L7_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6 ,%xmm6 - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI,SIZE) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,SIZE) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - jmp .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L7_37 - ALIGN_4 - - -.L7_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) - - addq $2 * SIZE, CO1 
# coffset += 2 - ALIGN_4 - -.L7_40: - testq $1, M - jz .L7_60 // to next 3 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_42: - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - jmp .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L7_47 - ALIGN_4 - - -.L7_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L7_60: - - decq J // j -- - jg .L6_01 - - -.L2_0: - cmpq $0, Nmod6 // N % 6 == 0 - je .L999 - -/************************************************************************************************ -* Loop for Nmod6 / 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - sarq $1, J // j = j / 2 - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB(xxx) - addq $2, BI - addq $16, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps 
(CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 3 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - 
KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - - addq $1 * 
SIZE, CO1 # coffset += 1 - ALIGN_4 - -.L2_60: - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB(xxx) - addq $1, BI - addq $16, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - 
salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - - vmovups %xmm4 , (CO1) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - 
KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - - vmovss %xmm4 , (CO1) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) 
&& !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB(xxx) - addq $2, BI - addq $16, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - vmulps %xmm0, %xmm10,%xmm10 - vmulps %xmm0, %xmm13,%xmm13 - - vmulps %xmm0, %xmm5,%xmm5 - vmulps %xmm0, %xmm8,%xmm8 - vmulps %xmm0, %xmm11,%xmm11 - vmulps %xmm0, %xmm14,%xmm14 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M 
- - testq $15, M - jz .L2_60 // to next 3 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - - vmulps %xmm0, %xmm5,%xmm5 - vmulps %xmm0, %xmm8,%xmm8 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq 
KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm5,%xmm5 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax 
// rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - -#else - vmulss %xmm0, %xmm4,%xmm4 - vmulss %xmm0, %xmm8,%xmm8 - vmulss %xmm0, %xmm5,%xmm5 - vmulss %xmm0, %xmm10,%xmm10 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq 
%rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulss %xmm0, %xmm4,%xmm4 - vmulss %xmm0, %xmm5,%xmm5 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - 
salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB(xxx) - addq $1, BI - addq $16, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - vmulps %xmm0, %xmm10,%xmm10 - vmulps %xmm0, %xmm13,%xmm13 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, 
%xmm7,%xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulps %xmm0, %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO 
-#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - -#else - vmulss %xmm0, %xmm4,%xmm4 - vmulss %xmm0, %xmm8,%xmm8 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - 
leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulss %xmm0, %xmm4,%xmm4 - -#endif - - vmovss %xmm4 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - - - -#endif +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 +#define LB2_OFFSET 4096 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + + +#define A_PR1 384 +#define B_PR1 192 + +/******************************************************************************************* +* 3 lines of N +*******************************************************************************************/ + +#define KERNEL16x3_1(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_2(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + 
vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_3(xx) \ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_4(xx) \ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + addq $12, BI ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $64, %rax ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_2(xx) \ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps 
%xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_4(xx) \ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + addq $12, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_2(xx) \ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_4(xx) \ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 
;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x3_1(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_2(xx) \ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_4(xx) \ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + addq $12, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x3_SUB(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x3_1(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_2(xx) \ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, 
SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_4(xx) \ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x3_SUB(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define KERNEL16x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + 
vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $8, BI ;\ + addq $64, %rax ;\ + +#define KERNEL16x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_2(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_4(xx) \ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_2(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_4(xx) \ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_2(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_4(xx) \ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x2_SUB(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_2(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_4(xx) \ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x2_SUB(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +#define KERNEL16x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + addq $4, BI ;\ + addq $64, %rax ;\ + +#define KERNEL16x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 
;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_2(xx) \ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_4(xx) \ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + addq $4, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_2(xx) \ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_4(xx) \ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_2(xx) \ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_4(xx) \ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x1_SUB(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + 
vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_2(xx) \ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_4(xx) \ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x1_SUB(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_01a_2 + ALIGN_4 + +.L6_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovsd 2 * SIZE(BO1), %xmm2 + vmovsd 4 * SIZE(BO1), %xmm4 + vmovsd 6 * SIZE(BO1), %xmm6 + vmovss 0 * SIZE(BO2), %xmm1 + vmovss 2 * SIZE(BO2), %xmm3 + vmovss 4 * SIZE(BO2), %xmm5 + vmovss 6 * SIZE(BO2), %xmm7 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 3*SIZE(BO) + vmovss %xmm3, 5*SIZE(BO) + vmovsd %xmm4, 6*SIZE(BO) + vmovss %xmm5, 8*SIZE(BO) + vmovsd %xmm6, 9*SIZE(BO) + vmovss %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovsd 2 * SIZE(BO1), %xmm2 + vmovsd 4 * SIZE(BO1), %xmm4 + vmovsd 6 * SIZE(BO1), %xmm6 + vmovss 0 * SIZE(BO2), %xmm1 + vmovss 2 * SIZE(BO2), %xmm3 + vmovss 4 * SIZE(BO2), %xmm5 + vmovss 6 * SIZE(BO2), %xmm7 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 3*SIZE(BO) + vmovss %xmm3, 5*SIZE(BO) + vmovsd %xmm4, 6*SIZE(BO) + vmovss %xmm5, 8*SIZE(BO) + vmovsd %xmm6, 9*SIZE(BO) + vmovss %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_01a_1 + + + +.L6_01a_2: + + 
movq K, %rax + andq $7, %rax // K % 8 + jz .L6_02c + ALIGN_4 + + +.L6_02b: + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovss 0 * SIZE(BO2), %xmm2 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm2, 2*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax, SIZE), BO1 // next offset to BO1 + leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_02c_2 + ALIGN_4 + +.L6_02c_1: + + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovsd 0 * SIZE(BO2), %xmm0 + vmovsd 2 * SIZE(BO2), %xmm2 + vmovsd 4 * SIZE(BO2), %xmm4 + vmovsd 6 * SIZE(BO2), %xmm6 + vmovss 1 * SIZE(BO1), %xmm1 + vmovss 3 * SIZE(BO1), %xmm3 + vmovss 5 * SIZE(BO1), %xmm5 + vmovss 7 * SIZE(BO1), %xmm7 + vmovss %xmm1, 0*SIZE(BO) + vmovsd %xmm0, 1*SIZE(BO) + vmovss %xmm3, 3*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovss %xmm5, 6*SIZE(BO) + vmovsd %xmm4, 7*SIZE(BO) + vmovss %xmm7, 9*SIZE(BO) + vmovsd %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + + vmovsd 0 * SIZE(BO2), %xmm0 + vmovsd 2 * SIZE(BO2), %xmm2 + vmovsd 4 * SIZE(BO2), %xmm4 + vmovsd 6 * SIZE(BO2), %xmm6 + vmovss 1 * SIZE(BO1), %xmm1 + vmovss 3 * SIZE(BO1), %xmm3 + vmovss 5 * SIZE(BO1), %xmm5 + vmovss 7 * SIZE(BO1), %xmm7 + vmovss %xmm1, 0*SIZE(BO) + vmovsd %xmm0, 1*SIZE(BO) + vmovss %xmm3, 3*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovss %xmm5, 6*SIZE(BO) + vmovsd %xmm4, 7*SIZE(BO) + vmovss %xmm7, 9*SIZE(BO) + vmovsd %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_02c_1 + + +.L6_02c_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_03c + ALIGN_4 + +.L6_03b: + + vmovss 1*SIZE(BO1), %xmm0 + vmovsd 0*SIZE(BO2), %xmm1 + vmovss %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L6_16 + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_17: + + KERNEL16x3_SUB(xxx) + addq $3, BI + addq $16, %rax + jl .L6_17 + ALIGN_4 + + +.L6_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 
* SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) + vmovups %xmm15,12 * SIZE(CO1, LDC, 2) + + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L7_10 // to next 3 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_20_6 + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_20_6 + + jmp .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_7: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; 
number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L6_27 + ALIGN_4 + + +.L6_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI,SIZE) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,SIZE) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L6_37 + ALIGN_4 + + +.L6_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_42: + + 
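+// 1x3 inner loop: four-way unrolled KERNEL1x3 sequence, with an exit check (je .L6_46) after every eight steps.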
KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L6_47 + ALIGN_4 + + +.L6_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L7_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L7_16 + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_17: + + KERNEL16x3_SUB(xxx) + addq $3, BI + addq $16, %rax + jl .L7_17 + ALIGN_4 + + +.L7_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + vmovups 
%xmm12, 8 * SIZE(CO1, LDC, 2) + vmovups %xmm15,12 * SIZE(CO1, LDC, 2) + + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 3 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER2, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_20_6 + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_20_6 + + jmp .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_7: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L7_20_7 + ALIGN_4 + +.L7_20_9: + + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + 
leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L7_27 + ALIGN_4 + + +.L7_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6 ,%xmm6 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI,SIZE) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,SIZE) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L7_37 + ALIGN_4 + + +.L7_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 3 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L7_47 + ALIGN_4 + + +.L7_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vmovss %xmm4 , (CO1) + vmovss %xmm5 
, (CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB(xxx) + addq $2, BI + addq $16, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // 
rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 
B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + 
KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB(xxx) + addq $1, BI + addq $16, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + 
ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + + vmovups %xmm4 , (CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + + vmovss %xmm4 , (CO1) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* 
TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + 
KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB(xxx) + addq $2, BI + addq $16, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + vmulps %xmm0, %xmm10,%xmm10 + vmulps %xmm0, %xmm13,%xmm13 + + vmulps %xmm0, %xmm5,%xmm5 + vmulps %xmm0, %xmm8,%xmm8 + vmulps %xmm0, %xmm11,%xmm11 + vmulps %xmm0, %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: 
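+// 8x2 inner loop: four-way unrolled KERNEL8x2 sequence with a B prefetch and an exit check (je .L2_20_6) after every eight steps.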
+ + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + + vmulps %xmm0, %xmm5,%xmm5 + vmulps %xmm0, %xmm8,%xmm8 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + 
ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm5,%xmm5 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm8,%xmm8 + vmulss 
%xmm0, %xmm5,%xmm5 + vmulss %xmm0, %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 
+*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB(xxx) + addq $1, BI + addq $16, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + vmulps %xmm0, %xmm10,%xmm10 + vmulps %xmm0, %xmm13,%xmm13 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + 
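+// Tail handling follows: the remaining M % 16 rows are processed with the 8x1, 4x1, 2x1 and 1x1 kernels for the single remaining column of B.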
+/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulps %xmm0, %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, 
SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm8,%xmm8 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulss %xmm0, %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + 
addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/sgemm_kernel_16x2_piledriver.S b/kernel/x86_64/sgemm_kernel_16x2_piledriver.S index 7c42f1e12..35b01de07 100644 --- a/kernel/x86_64/sgemm_kernel_16x2_piledriver.S +++ b/kernel/x86_64/sgemm_kernel_16x2_piledriver.S @@ -1,5258 +1,5258 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -/********************************************************************* -* -* 2013/10/18 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* -* 2013/10/29 Saar -* -* Parameter: -* UNROLL_M 16 -* UNROLL_N 2 -* SGEMM_P 768 -* SGEMM_Q 192 -* SGEMM_R 12288 -* A_PR1 384 -* B_PR1 192 -* -* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): -* -* 6144x6144 168.2 GFLOPS with 8 threads on 4 modules (ACML: 158.0 ) (BULLDOZER: 167.4 ) -* 6144x6144 162.7 GFLOPS with 4 threads on 4 modules (ACML: 157.6 ) (BULLDOZER: 159.0 ) -* 6144x6144 82.0 GFLOPS with 2 threads on 2 modules (ACML: 81.4 ) (BULLDOZER: 80.3 ) -* 6144x6144 41.3 GFLOPS with 1 threads on 1 modules (ACML: 41.1 ) (BULLDOZER: 40.4 ) -* -* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): -* -* 12288x12288 469.5 GFLOPS with 32 threads on 16 modules (ACML: 375.3 ) (BULLDOZER: 445.5 ) -* 12288x12288 442.9 GFLOPS with 16 threads on 16 modules (ACML: 378.5 ) (BULLDOZER: 416.3 ) -* 12288x12288 265.1 GFLOPS with 8 threads on 8 modules (ACML: 218.5 ) (BULLDOZER: 261.5 ) -* 6144x6144 139.7 GFLOPS with 4 threads on 4 modules (ACML: 116.0 ) (BULLDOZER: 137.7 ) -* 6144x6144 70.9 GFLOPS with 2 threads on 2 modules (ACML: 67.4 ) (BULLDOZER: 69.5 ) -* 6144x6144 35.6 GFLOPS with 1 threads on 1 modules (ACML: 36.1 ) (BULLDOZER: 35.1 ) -* -*********************************************************************/ - - - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define BO2 %r15 - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 -#define LB2_OFFSET 4096 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) -#define BUFFER2 LB2_OFFSET+128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - - - -#define A_PR1 384 -#define B_PR1 192 - -/******************************************************************************************* -* 3 lines of N -*******************************************************************************************/ - -#define KERNEL16x3_1(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps 
%xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_2(xx) \ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_3(xx) \ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_4(xx) \ - vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - addq $12, BI ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - addq $64, %rax ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - -#define KERNEL16x3_SUB(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps 
%xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ - - -/*******************************************************************************************/ - -#define KERNEL8x3_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL8x3_2(xx) \ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL8x3_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - -#define KERNEL8x3_4(xx) \ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - addq $12, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x3_SUB(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - nop ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x3_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL4x3_2(xx) \ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL4x3_3(xx) \ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL4x3_4(xx) \ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x3_SUB(xx) \ - vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -/*******************************************************************************************/ - -#define KERNEL2x3_1(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -#define KERNEL2x3_2(xx) \ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -#define KERNEL2x3_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -#define KERNEL2x3_4(xx) \ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss 
%xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - addq $12, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x3_SUB(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x3_1(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_2(xx) \ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -#define KERNEL1x3_4(xx) \ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - addq $12, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x3_SUB(xx) \ - vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ - vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ - -/*******************************************************************************************/ - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -#define KERNEL16x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL16x2_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps 
%xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL16x2_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - -#define KERNEL16x2_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - addq $8, BI ;\ - addq $64, %rax ;\ - -#define KERNEL16x2_SUB(xx) \ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ - - -/*******************************************************************************************/ - -#define KERNEL8x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL8x2_2(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL8x2_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 
;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - -#define KERNEL8x2_4(xx) \ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - addq $8, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x2_SUB(xx) \ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x2_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL4x2_2(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL4x2_3(xx) \ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL4x2_4(xx) \ - vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x2_SUB(xx) \ - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -/*******************************************************************************************/ - -#define KERNEL2x2_1(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -#define KERNEL2x2_2(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -#define KERNEL2x2_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -#define KERNEL2x2_4(xx) \ - vmovss 2 * 
SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - addq $8, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x2_SUB(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x2_1(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_2(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -#define KERNEL1x2_4(xx) \ - vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - addq $8, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x2_SUB(xx) \ - vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ - vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ - -/*******************************************************************************************/ - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -#define KERNEL16x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL16x1_2(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL16x1_3(xx) \ - prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 8 * 
SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - -#define KERNEL16x1_4(xx) \ - prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - addq $4, BI ;\ - addq $64, %rax ;\ - -#define KERNEL16x1_SUB(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ - - -/*******************************************************************************************/ - -#define KERNEL8x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL8x1_2(xx) \ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL8x1_3(xx) \ - prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - -#define KERNEL8x1_4(xx) \ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - addq $4, BI ;\ - addq $32, %rax ;\ - -#define KERNEL8x1_SUB(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ - - -/*******************************************************************************************/ - -#define KERNEL4x1_1(xx) \ - prefetcht0 A_PR1(AO,%rax,SIZE) ;\ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL4x1_2(xx) \ - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL4x1_3(xx) \ - vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL4x1_4(xx) \ - vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $16, %rax ;\ - -#define KERNEL4x1_SUB(xx) \ - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ - 
-/*******************************************************************************************/ - -#define KERNEL2x1_1(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -#define KERNEL2x1_2(xx) \ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -#define KERNEL2x1_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -#define KERNEL2x1_4(xx) \ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - addq $4, BI ;\ - addq $8, %rax ;\ - -#define KERNEL2x1_SUB(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ - -/*******************************************************************************************/ - -#define KERNEL1x1_1(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_2(xx) \ - vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_3(xx) \ - vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -#define KERNEL1x1_4(xx) \ - vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - addq $4, BI ;\ - addq $4, %rax ;\ - -#define KERNEL1x1_SUB(xx) \ - vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ - vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ - vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ - -/*******************************************************************************************/ - -#if !defined(TRMMKERNEL) - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC - - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $6, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - movq Ndiv6, J 
- cmpq $0, J - je .L2_0 - ALIGN_4 - -.L6_01: - // copy to sub buffer - movq K, %rax - salq $1,%rax // K * 2 ; read 2 values - movq B, BO1 - leaq (B,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_01a_2 - ALIGN_4 - -.L6_01a_1: - - prefetcht0 512(BO1) - prefetcht0 512(BO2) - prefetchw 512(BO) - - vmovsd 0 * SIZE(BO1), %xmm0 - vmovsd 2 * SIZE(BO1), %xmm2 - vmovsd 4 * SIZE(BO1), %xmm4 - vmovsd 6 * SIZE(BO1), %xmm6 - vmovss 0 * SIZE(BO2), %xmm1 - vmovss 2 * SIZE(BO2), %xmm3 - vmovss 4 * SIZE(BO2), %xmm5 - vmovss 6 * SIZE(BO2), %xmm7 - vmovsd %xmm0, 0*SIZE(BO) - vmovss %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 3*SIZE(BO) - vmovss %xmm3, 5*SIZE(BO) - vmovsd %xmm4, 6*SIZE(BO) - vmovss %xmm5, 8*SIZE(BO) - vmovsd %xmm6, 9*SIZE(BO) - vmovss %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - vmovsd 0 * SIZE(BO1), %xmm0 - vmovsd 2 * SIZE(BO1), %xmm2 - vmovsd 4 * SIZE(BO1), %xmm4 - vmovsd 6 * SIZE(BO1), %xmm6 - vmovss 0 * SIZE(BO2), %xmm1 - vmovss 2 * SIZE(BO2), %xmm3 - vmovss 4 * SIZE(BO2), %xmm5 - vmovss 6 * SIZE(BO2), %xmm7 - vmovsd %xmm0, 0*SIZE(BO) - vmovss %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 3*SIZE(BO) - vmovss %xmm3, 5*SIZE(BO) - vmovsd %xmm4, 6*SIZE(BO) - vmovss %xmm5, 8*SIZE(BO) - vmovsd %xmm6, 9*SIZE(BO) - vmovss %xmm7,11*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - decq %rax - jnz .L6_01a_1 - - - -.L6_01a_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_02c - ALIGN_4 - - -.L6_02b: - - vmovsd 0 * SIZE(BO1), %xmm0 - vmovss 0 * SIZE(BO2), %xmm2 - vmovsd %xmm0, 0*SIZE(BO) - vmovss %xmm2, 2*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_02b - -.L6_02c: - - movq K, %rax - salq $1,%rax // K * 2 - leaq (B,%rax, SIZE), BO1 // next offset to BO1 - leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 - leaq BUFFER2, BO // second buffer to BO - movq K, %rax - sarq $3 , %rax // K / 8 - jz .L6_02c_2 - ALIGN_4 - -.L6_02c_1: - - prefetcht0 512(BO2) - prefetchw 512(BO) - - vmovsd 0 * SIZE(BO2), %xmm0 - vmovsd 2 * SIZE(BO2), %xmm2 - vmovsd 4 * SIZE(BO2), %xmm4 - vmovsd 6 * SIZE(BO2), %xmm6 - vmovss 1 * SIZE(BO1), %xmm1 - vmovss 3 * SIZE(BO1), %xmm3 - vmovss 5 * SIZE(BO1), %xmm5 - vmovss 7 * SIZE(BO1), %xmm7 - vmovss %xmm1, 0*SIZE(BO) - vmovsd %xmm0, 1*SIZE(BO) - vmovss %xmm3, 3*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovss %xmm5, 6*SIZE(BO) - vmovsd %xmm4, 7*SIZE(BO) - vmovss %xmm7, 9*SIZE(BO) - vmovsd %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - - vmovsd 0 * SIZE(BO2), %xmm0 - vmovsd 2 * SIZE(BO2), %xmm2 - vmovsd 4 * SIZE(BO2), %xmm4 - vmovsd 6 * SIZE(BO2), %xmm6 - vmovss 1 * SIZE(BO1), %xmm1 - vmovss 3 * SIZE(BO1), %xmm3 - vmovss 5 * SIZE(BO1), %xmm5 - vmovss 7 * SIZE(BO1), %xmm7 - vmovss %xmm1, 0*SIZE(BO) - vmovsd %xmm0, 1*SIZE(BO) - vmovss %xmm3, 3*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovss %xmm5, 6*SIZE(BO) - vmovsd %xmm4, 7*SIZE(BO) - vmovss %xmm7, 9*SIZE(BO) - vmovsd %xmm6,10*SIZE(BO) - addq $8*SIZE,BO1 - addq $8*SIZE,BO2 - addq $12*SIZE,BO - - decq %rax - jnz .L6_02c_1 - - -.L6_02c_2: - - movq K, %rax - andq $7, %rax // K % 8 - jz .L6_03c - ALIGN_4 - -.L6_03b: - - vmovss 1*SIZE(BO1), %xmm0 - vmovsd 0*SIZE(BO2), %xmm1 - vmovss %xmm0, 0*SIZE(BO) - vmovsd %xmm1, 1*SIZE(BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO2 - addq $3*SIZE,BO - decq %rax - jnz .L6_03b - - -.L6_03c: - - movq BO2, B // next offset of B - -.L6_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // 
aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L6_20 - - ALIGN_4 - -.L6_11: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L6_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L6_16 - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L6_16 - - jmp .L6_12 - ALIGN_4 - -.L6_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_17: - - KERNEL16x3_SUB(xxx) - addq $3, BI - addq $16, %rax - jl .L6_17 - ALIGN_4 - - -.L6_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) - vmovups %xmm15,12 * SIZE(CO1, LDC, 2) - - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L6_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L6_20: - // Test rest of M - - testq $15, M - jz .L7_10 // to next 3 lines of N - - testq $8, M - jz .L6_21pre - ALIGN_4 - -/**************************************************************************/ - -.L6_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L6_20_6 - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - 
KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L6_20_6 - - jmp .L6_20_2 - ALIGN_4 - -.L6_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_20_7: - - KERNEL8x3_SUB(xxx) - addq $3, BI - addq $8, %rax - jl .L6_20_7 - ALIGN_4 - - -.L6_20_9: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L6_21pre: - - testq $4, M - jz .L6_30 - ALIGN_4 - -.L6_21: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L6_26 - - jmp .L6_22 - ALIGN_4 - -.L6_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L6_27 - ALIGN_4 - - -.L6_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L6_30: - testq $2, M - jz .L6_40 - - ALIGN_4 - -.L6_31: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI,SIZE) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,SIZE) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - 
KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L6_36 - - jmp .L6_32 - ALIGN_4 - -.L6_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L6_37 - ALIGN_4 - - -.L6_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L6_40: - testq $1, M - jz .L7_10 // to next 3 lines of N - - ALIGN_4 - -.L6_41: - leaq BUFFER1, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L6_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_42: - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L6_46 - - jmp .L6_42 - ALIGN_4 - -.L6_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L6_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L6_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L6_47 - ALIGN_4 - - -.L6_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - -/***************************************************************************************************************/ - -.L7_10: - movq C, CO1 - leaq (C, LDC, 2), C - leaq (C, LDC, 1), C // c += 3 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L7_20 - - ALIGN_4 - -.L7_11: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L7_16 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L7_16 - - KERNEL16x3_1(xxx) - KERNEL16x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - KERNEL16x3_1(xxx) - 
KERNEL16x3_2(xxx) - KERNEL16x3_3(xxx) - KERNEL16x3_4(xxx) - - je .L7_16 - - jmp .L7_12 - ALIGN_4 - -.L7_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_17: - - KERNEL16x3_SUB(xxx) - addq $3, BI - addq $16, %rax - jl .L7_17 - ALIGN_4 - - -.L7_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 - - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) - vmovups %xmm15,12 * SIZE(CO1, LDC, 2) - - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L7_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L7_20: - // Test rest of M - - testq $15, M - jz .L7_60 // to next 3 lines of N - - testq $8, M - jz .L7_21pre - ALIGN_4 - -/**************************************************************************/ - -.L7_20_1: - leaq BUFFER2, BO // first buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L7_20_6 - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - KERNEL8x3_1(xxx) - KERNEL8x3_2(xxx) - KERNEL8x3_3(xxx) - KERNEL8x3_4(xxx) - - je .L7_20_6 - - jmp .L7_20_2 - ALIGN_4 - -.L7_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_20_7: - - KERNEL8x3_SUB(xxx) - addq $3, BI - addq $8, %rax - jl .L7_20_7 - ALIGN_4 - -.L7_20_9: - - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 - - vmovups 
%xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - vmovups %xmm6 , (CO1, LDC, 2) - vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) - - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L7_21pre: - - testq $4, M - jz .L7_30 - ALIGN_4 - -.L7_21: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_26 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI, SIZE) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI, SIZE) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - KERNEL4x3_1(xxx) - KERNEL4x3_2(xxx) - KERNEL4x3_3(xxx) - KERNEL4x3_4(xxx) - - je .L7_26 - - jmp .L7_22 - ALIGN_4 - -.L7_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_27: - - KERNEL4x3_SUB(xxx) - addq $3, BI - addq $4, %rax - jl .L7_27 - ALIGN_4 - - -.L7_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps (CO1, LDC, 2),%xmm0, %xmm6 ,%xmm6 - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm6 , (CO1, LDC, 2) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L7_30: - testq $2, M - jz .L7_40 - - ALIGN_4 - -.L7_31: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_36 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - prefetcht0 B_PR1+16(BO,BI,SIZE) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - prefetcht0 B_PR1+32(BO,BI,SIZE) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - KERNEL2x3_1(xxx) - KERNEL2x3_2(xxx) - KERNEL2x3_3(xxx) - KERNEL2x3_4(xxx) - - je .L7_36 - - jmp .L7_32 - ALIGN_4 - -.L7_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_37: - - KERNEL2x3_SUB(xxx) - addq $3, BI - addq $2, %rax - jl .L7_37 - ALIGN_4 - - -.L7_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , 
(CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L7_40: - testq $1, M - jz .L7_60 // to next 3 lines of N - - ALIGN_4 - -.L7_41: - leaq BUFFER2, BO // second buffer to BO - addq $6 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L7_46 - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_42: - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - KERNEL1x3_1(xxx) - KERNEL1x3_2(xxx) - KERNEL1x3_3(xxx) - KERNEL1x3_4(xxx) - - je .L7_46 - - jmp .L7_42 - ALIGN_4 - -.L7_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L7_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,2), BI // BI = BI * 3 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L7_47: - - KERNEL1x3_SUB(xxx) - addq $3, BI - addq $1, %rax - jl .L7_47 - ALIGN_4 - - -.L7_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm6 , (CO1, LDC, 2) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - -.L7_60: - - decq J // j -- - jg .L6_01 - - -.L2_0: - cmpq $0, Nmod6 // N % 6 == 0 - je .L999 - -/************************************************************************************************ -* Loop for Nmod6 / 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - sarq $1, J // j = j / 2 - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - 
-.L2_17: - - KERNEL16x2_SUB(xxx) - addq $2, BI - addq $16, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 3 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI, 
SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovss ALPHA, %xmm0 - - 
vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - -.L2_60: - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB(xxx) - addq $1, BI - addq $16, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je 
.L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vbroadcastss ALPHA, %xmm0 - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - - vmovups %xmm4 , (CO1) - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - - vzeroall - - movq K, %rax - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - 
ALIGN_4 - -.L1_42: - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: - movq K, %rax - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovss ALPHA, %xmm0 - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - - vmovss %xmm4 , (CO1) - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - -#else -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - movsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovsd %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $2, %rdi - divq %rdi // N / 6 - movq %rax, Ndiv6 // N / 6 - movq %rdx, Nmod6 // N % 6 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L1_0 - ALIGN_4 - -.L2_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L2_02b: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02b - -.L2_02c: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - 
-.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - KERNEL16x2_1(xxx) - KERNEL16x2_2(xxx) - KERNEL16x2_3(xxx) - KERNEL16x2_4(xxx) - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB(xxx) - addq $2, BI - addq $16, %rax - jl .L2_17 - ALIGN_4 - - -.L2_19: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 - vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - vmulps %xmm0, %xmm10,%xmm10 - vmulps %xmm0, %xmm13,%xmm13 - - vmulps %xmm0, %xmm5,%xmm5 - vmulps %xmm0, %xmm8,%xmm8 - vmulps %xmm0, %xmm11,%xmm11 - vmulps %xmm0, %xmm14,%xmm14 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - vmovups %xmm11, 8 * SIZE(CO1, LDC) - vmovups %xmm14,12 * SIZE(CO1, LDC) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - 
-/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 3 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - KERNEL8x2_1(xxx) - KERNEL8x2_2(xxx) - KERNEL8x2_3(xxx) - KERNEL8x2_4(xxx) - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB(xxx) - addq $2, BI - addq $8, %rax - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - - vmulps %xmm0, %xmm5,%xmm5 - vmulps %xmm0, %xmm8,%xmm8 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - - vmovups %xmm5 , (CO1, LDC) - vmovups %xmm8 , 4 * SIZE(CO1, LDC) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - KERNEL4x2_1(xxx) - KERNEL4x2_2(xxx) - KERNEL4x2_3(xxx) - KERNEL4x2_4(xxx) - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB(xxx) - addq $2, BI - addq $4, %rax - jl .L2_27 - ALIGN_4 - - -.L2_29: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm5,%xmm5 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm5 , (CO1, LDC) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, 
%rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - KERNEL2x2_1(xxx) - KERNEL2x2_2(xxx) - KERNEL2x2_3(xxx) - KERNEL2x2_4(xxx) - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB(xxx) - addq $2, BI - addq $2, %rax - jl .L2_37 - ALIGN_4 - - -.L2_39: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 - -#else - vmulss %xmm0, %xmm4,%xmm4 - vmulss %xmm0, %xmm8,%xmm8 - vmulss %xmm0, %xmm5,%xmm5 - vmulss %xmm0, %xmm10,%xmm10 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - vmovss %xmm5 , (CO1, LDC) - vmovss %xmm10, 1 * SIZE(CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - KERNEL1x2_1(xxx) - KERNEL1x2_2(xxx) - 
KERNEL1x2_3(xxx) - KERNEL1x2_4(xxx) - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB(xxx) - addq $2, BI - addq $1, %rax - jl .L2_47 - ALIGN_4 - - -.L2_49: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 - -#else - vmulss %xmm0, %xmm4,%xmm4 - vmulss %xmm0, %xmm5,%xmm5 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , (CO1, LDC) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - decq J // j -- - jg .L2_01 // next 2 lines of N - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 -*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $32 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - je .L1_16 - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - KERNEL16x1_4(xxx) - - KERNEL16x1_1(xxx) - KERNEL16x1_2(xxx) - KERNEL16x1_3(xxx) - 
KERNEL16x1_4(xxx) - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB(xxx) - addq $1, BI - addq $16, %rax - jl .L1_17 - ALIGN_4 - - -.L1_19: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 - vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - vmulps %xmm0, %xmm10,%xmm10 - vmulps %xmm0, %xmm13,%xmm13 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - vmovups %xmm10, 8 * SIZE(CO1) - vmovups %xmm13,12 * SIZE(CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - KERNEL8x1_1(xxx) - KERNEL8x1_2(xxx) - KERNEL8x1_3(xxx) - KERNEL8x1_4(xxx) - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB(xxx) - addq $1, BI - addq $8, %rax - jl .L1_20_7 
- ALIGN_4 - - -.L1_20_9: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 - -#else - vmulps %xmm0, %xmm4,%xmm4 - vmulps %xmm0, %xmm7,%xmm7 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm7 , 4 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - prefetcht0 B_PR1(BO,BI, SIZE) - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - KERNEL4x1_1(xxx) - KERNEL4x1_2(xxx) - KERNEL4x1_3(xxx) - KERNEL4x1_4(xxx) - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB(xxx) - addq $1, BI - addq $4, %rax - jl .L1_27 - ALIGN_4 - - -.L1_29: - - vbroadcastss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddps (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulps %xmm0, %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq 
KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - prefetcht0 B_PR1(BO,BI,SIZE) - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - KERNEL2x1_1(xxx) - KERNEL2x1_2(xxx) - KERNEL2x1_3(xxx) - KERNEL2x1_4(xxx) - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB(xxx) - addq $1, BI - addq $2, %rax - jl .L1_37 - ALIGN_4 - - -.L1_39: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 - -#else - vmulss %xmm0, %xmm4,%xmm4 - vmulss %xmm0, %xmm8,%xmm8 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm8 , 1 * SIZE(CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $2 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - KERNEL1x1_1(xxx) - KERNEL1x1_2(xxx) - KERNEL1x1_3(xxx) - KERNEL1x1_4(xxx) - - je .L1_46 - - 
jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB(xxx) - addq $1, BI - addq $1, %rax - jl .L1_47 - ALIGN_4 - - -.L1_49: - - vmovss ALPHA, %xmm0 - -#ifndef TRMMKERNEL - - vfmaddss (CO1),%xmm0, %xmm4,%xmm4 - -#else - vmulss %xmm0, %xmm4,%xmm4 - -#endif - - vmovss %xmm4 , (CO1) - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - - - -#endif +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/********************************************************************* +* +* 2013/10/18 Saar +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2013/10/29 Saar +* +* Parameter: +* UNROLL_M 16 +* UNROLL_N 2 +* SGEMM_P 768 +* SGEMM_Q 192 +* SGEMM_R 12288 +* A_PR1 384 +* B_PR1 192 +* +* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1): +* +* 6144x6144 168.2 GFLOPS with 8 threads on 4 modules (ACML: 158.0 ) (BULLDOZER: 167.4 ) +* 6144x6144 162.7 GFLOPS with 4 threads on 4 modules (ACML: 157.6 ) (BULLDOZER: 159.0 ) +* 6144x6144 82.0 GFLOPS with 2 threads on 2 modules (ACML: 81.4 ) (BULLDOZER: 80.3 ) +* 6144x6144 41.3 GFLOPS with 1 threads on 1 modules (ACML: 41.1 ) (BULLDOZER: 40.4 ) +* +* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1): +* +* 12288x12288 469.5 GFLOPS with 32 threads on 16 modules (ACML: 375.3 ) (BULLDOZER: 445.5 ) +* 12288x12288 442.9 GFLOPS with 16 threads on 16 modules (ACML: 378.5 ) (BULLDOZER: 416.3 ) +* 12288x12288 265.1 GFLOPS with 8 threads on 8 modules (ACML: 218.5 ) (BULLDOZER: 261.5 ) +* 6144x6144 139.7 GFLOPS with 4 threads on 4 modules (ACML: 116.0 ) (BULLDOZER: 137.7 ) +* 6144x6144 70.9 GFLOPS with 2 threads on 2 modules (ACML: 67.4 ) (BULLDOZER: 69.5 ) +* 6144x6144 35.6 GFLOPS with 1 threads on 1 modules (ACML: 36.1 ) (BULLDOZER: 35.1 ) +* +*********************************************************************/ + + + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define BO2 %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 +#define LB2_OFFSET 4096 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) +#define BUFFER2 LB2_OFFSET+128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + + + +#define A_PR1 384 +#define B_PR1 192 + +/******************************************************************************************* +* 3 lines of N +*******************************************************************************************/ + +#define KERNEL16x3_1(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps 
%xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_2(xx) \ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_3(xx) \ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_4(xx) \ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + addq $12, BI ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $64, %rax ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + +#define KERNEL16x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps 
%xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_2(xx) \ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + +#define KERNEL8x3_4(xx) \ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + addq $12, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + nop ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x3_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps 
%xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_2(xx) \ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL4x3_4(xx) \ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x3_SUB(xx) \ + vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x3_1(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_2(xx) \ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +#define KERNEL2x3_4(xx) \ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss 
%xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + addq $12, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x3_SUB(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x3_1(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_2(xx) \ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +#define KERNEL1x3_4(xx) \ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + addq $12, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x3_SUB(xx) \ + vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ + vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +#define KERNEL16x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps 
%xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + +#define KERNEL16x2_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + addq $8, BI ;\ + addq $64, %rax ;\ + +#define KERNEL16x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_2(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 
;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + +#define KERNEL8x2_4(xx) \ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + addq $8, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x2_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_2(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL4x2_4(xx) \ + vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x2_SUB(xx) \ + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +#define KERNEL2x2_1(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_2(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +#define KERNEL2x2_4(xx) \ + vmovss 2 * 
SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + addq $8, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x2_SUB(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x2_1(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_2(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +#define KERNEL1x2_4(xx) \ + vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + addq $8, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x2_SUB(xx) \ + vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ + vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +#define KERNEL16x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_2(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_3(xx) \ + prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 8 * 
SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + +#define KERNEL16x1_4(xx) \ + prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + addq $4, BI ;\ + addq $64, %rax ;\ + +#define KERNEL16x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ + + +/*******************************************************************************************/ + +#define KERNEL8x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_2(xx) \ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_3(xx) \ + prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + +#define KERNEL8x1_4(xx) \ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + addq $4, BI ;\ + addq $32, %rax ;\ + +#define KERNEL8x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ + + +/*******************************************************************************************/ + +#define KERNEL4x1_1(xx) \ + prefetcht0 A_PR1(AO,%rax,SIZE) ;\ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_2(xx) \ + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_3(xx) \ + vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL4x1_4(xx) \ + vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $16, %rax ;\ + +#define KERNEL4x1_SUB(xx) \ + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ + 
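+
+/* Overview of the kernel macros (informal): each KERNEL<m>x<n>_1 .. _4 macro
+* performs one FMA rank-1 update of an m x n register block for a single k
+* step, and the _4 variant advances BI and %rax past the four k steps covered
+* by one _1.._4 group; KERNEL<m>x<n>_SUB does one k step for the k % 8 tail.
+* For m >= 4 the A column is loaded with vmovups and the B entry broadcast
+* (vbroadcastss + vfmaddps); the 2x and 1x macros use scalar vmovss /
+* vfmaddss. Roughly, in C terms, one k step computes
+*
+*     for (i = 0; i < m; i++)            // illustrative sketch only
+*         for (j = 0; j < n; j++)
+*             acc[i][j] += a[i] * b[j];
+*
+* with acc[][] kept in xmm registers (xmm4 upward) for the whole k loop. */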
+/*******************************************************************************************/ + +#define KERNEL2x1_1(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_2(xx) \ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +#define KERNEL2x1_4(xx) \ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + addq $4, BI ;\ + addq $8, %rax ;\ + +#define KERNEL2x1_SUB(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ + +/*******************************************************************************************/ + +#define KERNEL1x1_1(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_2(xx) \ + vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_3(xx) \ + vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +#define KERNEL1x1_4(xx) \ + vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + addq $4, BI ;\ + addq $4, %rax ;\ + +#define KERNEL1x1_SUB(xx) \ + vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ + vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ + vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ + +/*******************************************************************************************/ + +#if !defined(TRMMKERNEL) + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $6, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + movq Ndiv6, J 
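+
+/* Column blocking (informal overview): N is handled in blocks of 6 columns
+* (J = N / 6). For each block, the copy loops below repack B into two
+* 3-column sub-buffers, BUFFER1 and BUFFER2, and the .L6_xx and .L7_xx loops
+* then run the Mx3 kernels against each buffer in turn, advancing C by
+* 3 * LDC each time. The N % 6 remainder is processed afterwards by the
+* 2-column (.L2_xx) and 1-column (.L1_xx) paths. In rough pseudo-C
+* (pack_3_columns_of_B / multiply_Mx3 are illustrative names, not symbols
+* that exist anywhere in the source):
+*
+*     for (j = 0; j < N / 6; j++) {
+*         pack_3_columns_of_B(BUFFER1);   // roughly .L6_01 .. .L6_02b
+*         pack_3_columns_of_B(BUFFER2);   // roughly .L6_02c .. .L6_03b
+*         multiply_Mx3(BUFFER1);          // roughly .L6_10 .. .L6_49
+*         multiply_Mx3(BUFFER2);          // roughly .L7_10 .. .L7_60
+*     }
+*     // then the N % 6 leftover columns, in pairs and finally singly
+*/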
+ cmpq $0, J + je .L2_0 + ALIGN_4 + +.L6_01: + // copy to sub buffer + movq K, %rax + salq $1,%rax // K * 2 ; read 2 values + movq B, BO1 + leaq (B,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_01a_2 + ALIGN_4 + +.L6_01a_1: + + prefetcht0 512(BO1) + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovsd 2 * SIZE(BO1), %xmm2 + vmovsd 4 * SIZE(BO1), %xmm4 + vmovsd 6 * SIZE(BO1), %xmm6 + vmovss 0 * SIZE(BO2), %xmm1 + vmovss 2 * SIZE(BO2), %xmm3 + vmovss 4 * SIZE(BO2), %xmm5 + vmovss 6 * SIZE(BO2), %xmm7 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 3*SIZE(BO) + vmovss %xmm3, 5*SIZE(BO) + vmovsd %xmm4, 6*SIZE(BO) + vmovss %xmm5, 8*SIZE(BO) + vmovsd %xmm6, 9*SIZE(BO) + vmovss %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovsd 2 * SIZE(BO1), %xmm2 + vmovsd 4 * SIZE(BO1), %xmm4 + vmovsd 6 * SIZE(BO1), %xmm6 + vmovss 0 * SIZE(BO2), %xmm1 + vmovss 2 * SIZE(BO2), %xmm3 + vmovss 4 * SIZE(BO2), %xmm5 + vmovss 6 * SIZE(BO2), %xmm7 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 3*SIZE(BO) + vmovss %xmm3, 5*SIZE(BO) + vmovsd %xmm4, 6*SIZE(BO) + vmovss %xmm5, 8*SIZE(BO) + vmovsd %xmm6, 9*SIZE(BO) + vmovss %xmm7,11*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_01a_1 + + + +.L6_01a_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_02c + ALIGN_4 + + +.L6_02b: + + vmovsd 0 * SIZE(BO1), %xmm0 + vmovss 0 * SIZE(BO2), %xmm2 + vmovsd %xmm0, 0*SIZE(BO) + vmovss %xmm2, 2*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_02b + +.L6_02c: + + movq K, %rax + salq $1,%rax // K * 2 + leaq (B,%rax, SIZE), BO1 // next offset to BO1 + leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 + leaq BUFFER2, BO // second buffer to BO + movq K, %rax + sarq $3 , %rax // K / 8 + jz .L6_02c_2 + ALIGN_4 + +.L6_02c_1: + + prefetcht0 512(BO2) + prefetchw 512(BO) + + vmovsd 0 * SIZE(BO2), %xmm0 + vmovsd 2 * SIZE(BO2), %xmm2 + vmovsd 4 * SIZE(BO2), %xmm4 + vmovsd 6 * SIZE(BO2), %xmm6 + vmovss 1 * SIZE(BO1), %xmm1 + vmovss 3 * SIZE(BO1), %xmm3 + vmovss 5 * SIZE(BO1), %xmm5 + vmovss 7 * SIZE(BO1), %xmm7 + vmovss %xmm1, 0*SIZE(BO) + vmovsd %xmm0, 1*SIZE(BO) + vmovss %xmm3, 3*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovss %xmm5, 6*SIZE(BO) + vmovsd %xmm4, 7*SIZE(BO) + vmovss %xmm7, 9*SIZE(BO) + vmovsd %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + + vmovsd 0 * SIZE(BO2), %xmm0 + vmovsd 2 * SIZE(BO2), %xmm2 + vmovsd 4 * SIZE(BO2), %xmm4 + vmovsd 6 * SIZE(BO2), %xmm6 + vmovss 1 * SIZE(BO1), %xmm1 + vmovss 3 * SIZE(BO1), %xmm3 + vmovss 5 * SIZE(BO1), %xmm5 + vmovss 7 * SIZE(BO1), %xmm7 + vmovss %xmm1, 0*SIZE(BO) + vmovsd %xmm0, 1*SIZE(BO) + vmovss %xmm3, 3*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovss %xmm5, 6*SIZE(BO) + vmovsd %xmm4, 7*SIZE(BO) + vmovss %xmm7, 9*SIZE(BO) + vmovsd %xmm6,10*SIZE(BO) + addq $8*SIZE,BO1 + addq $8*SIZE,BO2 + addq $12*SIZE,BO + + decq %rax + jnz .L6_02c_1 + + +.L6_02c_2: + + movq K, %rax + andq $7, %rax // K % 8 + jz .L6_03c + ALIGN_4 + +.L6_03b: + + vmovss 1*SIZE(BO1), %xmm0 + vmovsd 0*SIZE(BO2), %xmm1 + vmovss %xmm0, 0*SIZE(BO) + vmovsd %xmm1, 1*SIZE(BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO2 + addq $3*SIZE,BO + decq %rax + jnz .L6_03b + + +.L6_03c: + + movq BO2, B // next offset of B + +.L6_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // 
aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L6_20 + + ALIGN_4 + +.L6_11: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L6_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L6_16 + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L6_16 + + jmp .L6_12 + ALIGN_4 + +.L6_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_17: + + KERNEL16x3_SUB(xxx) + addq $3, BI + addq $16, %rax + jl .L6_17 + ALIGN_4 + + +.L6_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) + vmovups %xmm15,12 * SIZE(CO1, LDC, 2) + + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L6_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L6_20: + // Test rest of M + + testq $15, M + jz .L7_10 // to next 3 lines of N + + testq $8, M + jz .L6_21pre + ALIGN_4 + +/**************************************************************************/ + +.L6_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_20_6 + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + 
KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L6_20_6 + + jmp .L6_20_2 + ALIGN_4 + +.L6_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_20_7: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L6_20_7 + ALIGN_4 + + +.L6_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L6_21pre: + + testq $4, M + jz .L6_30 + ALIGN_4 + +.L6_21: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L6_26 + + jmp .L6_22 + ALIGN_4 + +.L6_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L6_27 + ALIGN_4 + + +.L6_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L6_30: + testq $2, M + jz .L6_40 + + ALIGN_4 + +.L6_31: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI,SIZE) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,SIZE) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + 
KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L6_36 + + jmp .L6_32 + ALIGN_4 + +.L6_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L6_37 + ALIGN_4 + + +.L6_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L6_40: + testq $1, M + jz .L7_10 // to next 3 lines of N + + ALIGN_4 + +.L6_41: + leaq BUFFER1, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L6_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L6_46 + + jmp .L6_42 + ALIGN_4 + +.L6_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L6_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L6_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L6_47 + ALIGN_4 + + +.L6_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + +/***************************************************************************************************************/ + +.L7_10: + movq C, CO1 + leaq (C, LDC, 2), C + leaq (C, LDC, 1), C // c += 3 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L7_20 + + ALIGN_4 + +.L7_11: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L7_16 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L7_16 + + KERNEL16x3_1(xxx) + KERNEL16x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + KERNEL16x3_1(xxx) + 
KERNEL16x3_2(xxx) + KERNEL16x3_3(xxx) + KERNEL16x3_4(xxx) + + je .L7_16 + + jmp .L7_12 + ALIGN_4 + +.L7_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_17: + + KERNEL16x3_SUB(xxx) + addq $3, BI + addq $16, %rax + jl .L7_17 + ALIGN_4 + + +.L7_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 + + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) + vmovups %xmm15,12 * SIZE(CO1, LDC, 2) + + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L7_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L7_20: + // Test rest of M + + testq $15, M + jz .L7_60 // to next 3 lines of N + + testq $8, M + jz .L7_21pre + ALIGN_4 + +/**************************************************************************/ + +.L7_20_1: + leaq BUFFER2, BO // first buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_20_6 + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + KERNEL8x3_1(xxx) + KERNEL8x3_2(xxx) + KERNEL8x3_3(xxx) + KERNEL8x3_4(xxx) + + je .L7_20_6 + + jmp .L7_20_2 + ALIGN_4 + +.L7_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_20_7: + + KERNEL8x3_SUB(xxx) + addq $3, BI + addq $8, %rax + jl .L7_20_7 + ALIGN_4 + +.L7_20_9: + + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 + + vmovups 
%xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + vmovups %xmm6 , (CO1, LDC, 2) + vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L7_21pre: + + testq $4, M + jz .L7_30 + ALIGN_4 + +.L7_21: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_26 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI, SIZE) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI, SIZE) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + KERNEL4x3_1(xxx) + KERNEL4x3_2(xxx) + KERNEL4x3_3(xxx) + KERNEL4x3_4(xxx) + + je .L7_26 + + jmp .L7_22 + ALIGN_4 + +.L7_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_27: + + KERNEL4x3_SUB(xxx) + addq $3, BI + addq $4, %rax + jl .L7_27 + ALIGN_4 + + +.L7_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps (CO1, LDC, 2),%xmm0, %xmm6 ,%xmm6 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm6 , (CO1, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L7_30: + testq $2, M + jz .L7_40 + + ALIGN_4 + +.L7_31: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_36 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + prefetcht0 B_PR1+16(BO,BI,SIZE) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + prefetcht0 B_PR1+32(BO,BI,SIZE) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + KERNEL2x3_1(xxx) + KERNEL2x3_2(xxx) + KERNEL2x3_3(xxx) + KERNEL2x3_4(xxx) + + je .L7_36 + + jmp .L7_32 + ALIGN_4 + +.L7_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_37: + + KERNEL2x3_SUB(xxx) + addq $3, BI + addq $2, %rax + jl .L7_37 + ALIGN_4 + + +.L7_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , 
(CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L7_40: + testq $1, M + jz .L7_60 // to next 3 lines of N + + ALIGN_4 + +.L7_41: + leaq BUFFER2, BO // second buffer to BO + addq $6 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L7_46 + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_42: + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + KERNEL1x3_1(xxx) + KERNEL1x3_2(xxx) + KERNEL1x3_3(xxx) + KERNEL1x3_4(xxx) + + je .L7_46 + + jmp .L7_42 + ALIGN_4 + +.L7_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L7_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,2), BI // BI = BI * 3 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L7_47: + + KERNEL1x3_SUB(xxx) + addq $3, BI + addq $1, %rax + jl .L7_47 + ALIGN_4 + + +.L7_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm6 , (CO1, LDC, 2) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + +.L7_60: + + decq J // j -- + jg .L6_01 + + +.L2_0: + cmpq $0, Nmod6 // N % 6 == 0 + je .L999 + +/************************************************************************************************ +* Loop for Nmod6 / 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + sarq $1, J // j = j / 2 + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + 
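+
+/* Tail-loop indexing (the same pattern is used by the k loops throughout
+* this file): AO and BO have just been advanced past the k % 8 leftover
+* elements, and BI / %rax were negated, so the kernel's (BO,BI,SIZE) and
+* (AO,%rax,SIZE) addressing starts at the beginning of the leftover block and
+* walks forward while the counters climb toward zero; "jl" falls through once
+* the remainder is consumed. Informally: for (i = -r; i < 0; i++) do one
+* KERNEL..._SUB step, with the base pointers already sitting at the end of
+* the r leftover elements. */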
+.L2_17: + + KERNEL16x2_SUB(xxx) + addq $2, BI + addq $16, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI, 
SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovss ALPHA, %xmm0 + + 
vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + +.L2_60: + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB(xxx) + addq $1, BI + addq $16, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je 
.L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA, %xmm0 + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + + vmovups %xmm4 , (CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + + vzeroall + + movq K, %rax + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + 
ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: + movq K, %rax + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovss ALPHA, %xmm0 + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + + vmovss %xmm4 , (CO1) + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + +#else +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $2, %rdi + divq %rdi // N / 6 + movq %rax, Ndiv6 // N / 6 + movq %rdx, Nmod6 // N % 6 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L1_0 + ALIGN_4 + +.L2_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L2_02b: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02b + +.L2_02c: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + 
+.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + KERNEL16x2_1(xxx) + KERNEL16x2_2(xxx) + KERNEL16x2_3(xxx) + KERNEL16x2_4(xxx) + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB(xxx) + addq $2, BI + addq $16, %rax + jl .L2_17 + ALIGN_4 + + +.L2_19: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 + vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + vmulps %xmm0, %xmm10,%xmm10 + vmulps %xmm0, %xmm13,%xmm13 + + vmulps %xmm0, %xmm5,%xmm5 + vmulps %xmm0, %xmm8,%xmm8 + vmulps %xmm0, %xmm11,%xmm11 + vmulps %xmm0, %xmm14,%xmm14 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + vmovups %xmm11, 8 * SIZE(CO1, LDC) + vmovups %xmm14,12 * SIZE(CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + 
+/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 3 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + KERNEL8x2_1(xxx) + KERNEL8x2_2(xxx) + KERNEL8x2_3(xxx) + KERNEL8x2_4(xxx) + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB(xxx) + addq $2, BI + addq $8, %rax + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + + vmulps %xmm0, %xmm5,%xmm5 + vmulps %xmm0, %xmm8,%xmm8 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + + vmovups %xmm5 , (CO1, LDC) + vmovups %xmm8 , 4 * SIZE(CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + KERNEL4x2_1(xxx) + KERNEL4x2_2(xxx) + KERNEL4x2_3(xxx) + KERNEL4x2_4(xxx) + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB(xxx) + addq $2, BI + addq $4, %rax + jl .L2_27 + ALIGN_4 + + +.L2_29: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm5,%xmm5 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm5 , (CO1, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, 
%rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + KERNEL2x2_1(xxx) + KERNEL2x2_2(xxx) + KERNEL2x2_3(xxx) + KERNEL2x2_4(xxx) + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB(xxx) + addq $2, BI + addq $2, %rax + jl .L2_37 + ALIGN_4 + + +.L2_39: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm8,%xmm8 + vmulss %xmm0, %xmm5,%xmm5 + vmulss %xmm0, %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + vmovss %xmm5 , (CO1, LDC) + vmovss %xmm10, 1 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + KERNEL1x2_1(xxx) + KERNEL1x2_2(xxx) + 
KERNEL1x2_3(xxx) + KERNEL1x2_4(xxx) + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB(xxx) + addq $2, BI + addq $1, %rax + jl .L2_47 + ALIGN_4 + + +.L2_49: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , (CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + decq J // j -- + jg .L2_01 // next 2 lines of N + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 +*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $32 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + je .L1_16 + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + KERNEL16x1_4(xxx) + + KERNEL16x1_1(xxx) + KERNEL16x1_2(xxx) + KERNEL16x1_3(xxx) + 
KERNEL16x1_4(xxx) + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB(xxx) + addq $1, BI + addq $16, %rax + jl .L1_17 + ALIGN_4 + + +.L1_19: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 + vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + vmulps %xmm0, %xmm10,%xmm10 + vmulps %xmm0, %xmm13,%xmm13 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + vmovups %xmm10, 8 * SIZE(CO1) + vmovups %xmm13,12 * SIZE(CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + KERNEL8x1_1(xxx) + KERNEL8x1_2(xxx) + KERNEL8x1_3(xxx) + KERNEL8x1_4(xxx) + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB(xxx) + addq $1, BI + addq $8, %rax + jl .L1_20_7 
+ ALIGN_4 + + +.L1_20_9: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 + +#else + vmulps %xmm0, %xmm4,%xmm4 + vmulps %xmm0, %xmm7,%xmm7 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm7 , 4 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + prefetcht0 B_PR1(BO,BI, SIZE) + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + KERNEL4x1_1(xxx) + KERNEL4x1_2(xxx) + KERNEL4x1_3(xxx) + KERNEL4x1_4(xxx) + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB(xxx) + addq $1, BI + addq $4, %rax + jl .L1_27 + ALIGN_4 + + +.L1_29: + + vbroadcastss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddps (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulps %xmm0, %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq 
KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + prefetcht0 B_PR1(BO,BI,SIZE) + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + KERNEL2x1_1(xxx) + KERNEL2x1_2(xxx) + KERNEL2x1_3(xxx) + KERNEL2x1_4(xxx) + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB(xxx) + addq $1, BI + addq $2, %rax + jl .L1_37 + ALIGN_4 + + +.L1_39: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 + +#else + vmulss %xmm0, %xmm4,%xmm4 + vmulss %xmm0, %xmm8,%xmm8 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm8 , 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $2 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + KERNEL1x1_1(xxx) + KERNEL1x1_2(xxx) + KERNEL1x1_3(xxx) + KERNEL1x1_4(xxx) + + je .L1_46 + + 
jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB(xxx) + addq $1, BI + addq $1, %rax + jl .L1_47 + ALIGN_4 + + +.L1_49: + + vmovss ALPHA, %xmm0 + +#ifndef TRMMKERNEL + + vfmaddss (CO1),%xmm0, %xmm4,%xmm4 + +#else + vmulss %xmm0, %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + +#endif diff --git a/kernel/x86_64/sgemm_kernel_16x4_sandy.S b/kernel/x86_64/sgemm_kernel_16x4_sandy.S index ea15cd87e..2ee4b1554 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_sandy.S +++ b/kernel/x86_64/sgemm_kernel_16x4_sandy.S @@ -1,3167 +1,3167 @@ -/********************************************************************************* -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-**********************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define OLD_M %rdi -#define OLD_N %rsi -#define M %r13 -#define J %r14 -#define OLD_K %rdx - -#define A %rcx -#define B %r8 -#define C %r9 -#define LDC %r10 - -#define I %r11 -#define AO %rdi -#define BO %rsi -#define CO1 %r15 -#define K %r12 -#define BI %rbp -#define SP %rbx - -#define BO1 %rdi -#define CO2 %rdx - -#ifndef WINDOWS_ABI - -#define STACKSIZE 96 - -#else - -#define STACKSIZE 256 - -#define OLD_A 40 + STACKSIZE(%rsp) -#define OLD_B 48 + STACKSIZE(%rsp) -#define OLD_C 56 + STACKSIZE(%rsp) -#define OLD_LDC 64 + STACKSIZE(%rsp) -#define OLD_OFFSET 72 + STACKSIZE(%rsp) - -#endif - -#define L_BUFFER_SIZE 8192 - -#define Ndiv6 24(%rsp) -#define Nmod6 32(%rsp) -#define N 40(%rsp) -#define ALPHA 48(%rsp) -#define OFFSET 56(%rsp) -#define KK 64(%rsp) -#define KKK 72(%rsp) -#define BUFFER1 128(%rsp) - -#if defined(OS_WINDOWS) -#if L_BUFFER_SIZE > 16384 -#define STACK_TOUCH \ - movl $0, 4096 * 4(%rsp);\ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 12288 -#define STACK_TOUCH \ - movl $0, 4096 * 3(%rsp);\ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 8192 -#define STACK_TOUCH \ - movl $0, 4096 * 2(%rsp);\ - movl $0, 4096 * 1(%rsp); -#elif L_BUFFER_SIZE > 4096 -#define STACK_TOUCH \ - movl $0, 4096 * 1(%rsp); -#else -#define STACK_TOUCH -#endif -#else -#define STACK_TOUCH -#endif - -#define A_PR1 512 -#define B_PR1 512 - -/******************************************************************************************* -* 4 lines of N -*******************************************************************************************/ - -.macro KERNEL16x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm2 , %ymm1 , %ymm13 - vmulps %ymm3 , %ymm0 , %ymm14 - vmulps %ymm3 , %ymm1 , %ymm15 - vaddps %ymm12, %ymm4 , %ymm4 - vaddps %ymm13, %ymm5 , %ymm5 - vaddps %ymm14, %ymm6 , %ymm6 - vaddps %ymm15, %ymm7 , %ymm7 - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm2 , %ymm1 , %ymm13 - vmulps %ymm3 , %ymm0 , %ymm14 - vmulps %ymm3 , %ymm1 , %ymm15 - vaddps %ymm12, %ymm8 , %ymm8 - vaddps %ymm13, %ymm9 , %ymm9 - vaddps %ymm14, %ymm10, %ymm10 - vaddps %ymm15, %ymm11, %ymm11 - addq $ 4 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x4 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm9 , %ymm9 - vmulps %ymm0 , %ymm10, %ymm10 - vmulps %ymm0 , %ymm11, %ymm11 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - - vaddps (CO2), %ymm8,%ymm8 - vaddps 8 * SIZE(CO2), %ymm9,%ymm9 - - vaddps (CO2, LDC), %ymm10,%ymm10 - vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - - vmovups %ymm8 , (CO2) - vmovups %ymm9 , 8 * SIZE(CO2) - - vmovups %ymm10, (CO2, LDC) - vmovups %ymm11, 8 * SIZE(CO2, LDC) - - prefetcht0 64(CO1) - prefetcht0 64(CO1, LDC) - prefetcht0 
64(CO2) - prefetcht0 64(CO2, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm3 , %ymm0 , %ymm14 - vaddps %ymm12, %ymm4 , %ymm4 - vaddps %ymm14, %ymm6 , %ymm6 - vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm3 , %ymm0 , %ymm14 - vaddps %ymm12, %ymm8 , %ymm8 - vaddps %ymm14, %ymm10, %ymm10 - addq $ 4 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x4 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm8 , %ymm8 - vmulps %ymm0 , %ymm10, %ymm10 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps (CO2), %ymm8,%ymm8 - vaddps (CO2, LDC), %ymm10,%ymm10 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm8 , (CO2) - vmovups %ymm10, (CO2, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x4_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 - vmulps %xmm2 , %xmm0 , %xmm12 - vmulps %xmm3 , %xmm0 , %xmm14 - vaddps %xmm12, %xmm4 , %xmm4 - vaddps %xmm14, %xmm6 , %xmm6 - vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 - vmulps %xmm2 , %xmm0 , %xmm12 - vmulps %xmm3 , %xmm0 , %xmm14 - vaddps %xmm12, %xmm8 , %xmm8 - vaddps %xmm14, %xmm10, %xmm10 - addq $ 4 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x4 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - vmulps %xmm0 , %xmm8 , %xmm8 - vmulps %xmm0 , %xmm10, %xmm10 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - vaddps (CO2), %xmm8,%xmm8 - vaddps (CO2, LDC), %xmm10,%xmm10 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - vmovups %xmm8 , (CO2) - vmovups %xmm10, (CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x4_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm2 , %xmm1 , %xmm13 - vmulss %xmm3 , %xmm0 , %xmm14 - vmulss %xmm3 , %xmm1 , %xmm15 - vaddss %xmm12, %xmm4 , %xmm4 - vaddss %xmm13, %xmm5 , %xmm5 - vaddss %xmm14, %xmm6 , %xmm6 - vaddss %xmm15, %xmm7 , %xmm7 - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm2 , %xmm1 , %xmm13 - vmulss %xmm3 , %xmm0 , %xmm14 - vmulss %xmm3 , %xmm1 , %xmm15 - vaddss %xmm12, %xmm8 , %xmm8 - vaddss %xmm13, %xmm9 , %xmm9 - vaddss %xmm14, %xmm10, %xmm10 - vaddss %xmm15, %xmm11, %xmm11 - addq $ 4 , BI - addq $ 2, %rax -.endm - -.macro SAVE2x4 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm9 , %xmm9 - vmulss %xmm0 , %xmm10, %xmm10 - vmulss %xmm0 , %xmm11, %xmm11 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), 
%xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - - vaddss (CO2), %xmm8,%xmm8 - vaddss 1 * SIZE(CO2), %xmm9,%xmm9 - - vaddss (CO2, LDC), %xmm10,%xmm10 - vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - - vmovss %xmm8 , (CO2) - vmovss %xmm9 , 1 * SIZE(CO2) - - vmovss %xmm10, (CO2, LDC) - vmovss %xmm11, 1 * SIZE(CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x4_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm3 , %xmm0 , %xmm14 - vaddss %xmm12, %xmm4 , %xmm4 - vaddss %xmm14, %xmm6 , %xmm6 - vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm3 , %xmm0 , %xmm14 - vaddss %xmm12, %xmm8 , %xmm8 - vaddss %xmm14, %xmm10, %xmm10 - addq $ 4 , BI - addq $ 1, %rax -.endm - -.macro SAVE1x4 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm8 , %xmm8 - vmulss %xmm0 , %xmm10, %xmm10 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss (CO2), %xmm8,%xmm8 - vaddss (CO2, LDC), %xmm10,%xmm10 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm8 , (CO2) - vmovss %xmm10, (CO2, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 2 lines of N -*******************************************************************************************/ - -.macro KERNEL16x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm2 , %ymm1 , %ymm13 - vmulps %ymm3 , %ymm0 , %ymm14 - vmulps %ymm3 , %ymm1 , %ymm15 - vaddps %ymm12, %ymm4 , %ymm4 - vaddps %ymm13, %ymm5 , %ymm5 - vaddps %ymm14, %ymm6 , %ymm6 - vaddps %ymm15, %ymm7 , %ymm7 - addq $ 2 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x2 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - vmulps %ymm0 , %ymm6 , %ymm6 - vmulps %ymm0 , %ymm7 , %ymm7 - - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - - vaddps (CO1, LDC), %ymm6,%ymm6 - vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - - vmovups %ymm6 , (CO1, LDC) - vmovups %ymm7 , 8 * SIZE(CO1, LDC) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL8x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm3 , %ymm0 , %ymm14 - vaddps %ymm12, %ymm4 , %ymm4 - vaddps %ymm14, %ymm6 , %ymm6 - addq $ 2 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x2 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm6 , %ymm6 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps (CO1, LDC), %ymm6,%ymm6 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm6 , (CO1, LDC) 
- -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x2_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 - vmulps %xmm2 , %xmm0 , %xmm12 - vmulps %xmm3 , %xmm0 , %xmm14 - vaddps %xmm12, %xmm4 , %xmm4 - vaddps %xmm14, %xmm6 , %xmm6 - addq $ 2 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x2 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - vmulps %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - vaddps (CO1, LDC), %xmm6,%xmm6 - -#endif - - vmovups %xmm4 , (CO1) - vmovups %xmm6 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x2_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm2 , %xmm1 , %xmm13 - vmulss %xmm3 , %xmm0 , %xmm14 - vmulss %xmm3 , %xmm1 , %xmm15 - vaddss %xmm12, %xmm4 , %xmm4 - vaddss %xmm13, %xmm5 , %xmm5 - vaddss %xmm14, %xmm6 , %xmm6 - vaddss %xmm15, %xmm7 , %xmm7 - addq $ 2 , BI - addq $ 2, %rax -.endm - -.macro SAVE2x2 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - vmulss %xmm0 , %xmm6 , %xmm6 - vmulss %xmm0 , %xmm7 , %xmm7 - - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - - vaddss (CO1, LDC), %xmm6,%xmm6 - vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - - vmovss %xmm6 , (CO1, LDC) - vmovss %xmm7 , 1 * SIZE(CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x2_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm3 , %xmm0 , %xmm14 - vaddss %xmm12, %xmm4 , %xmm4 - vaddss %xmm14, %xmm6 , %xmm6 - addq $ 2 , BI - addq $ 1, %rax -.endm - -.macro SAVE1x2 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm6 , %xmm6 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss (CO1, LDC), %xmm6,%xmm6 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm6 , (CO1, LDC) - -.endm - - -/*******************************************************************************************/ - -/******************************************************************************************* -* 1 line of N -*******************************************************************************************/ - -.macro KERNEL16x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 - vmulps %ymm2 , %ymm0 , %ymm12 - vmulps %ymm2 , %ymm1 , %ymm13 - vaddps %ymm12, %ymm4 , %ymm4 - vaddps %ymm13, %ymm5 , %ymm5 - addq $ 1 , BI - addq $ 16, %rax -.endm - -.macro SAVE16x1 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - vmulps %ymm0 , %ymm5 , %ymm5 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - vaddps 8 * SIZE(CO1), %ymm5,%ymm5 - -#endif - - vmovups %ymm4 , (CO1) - vmovups %ymm5 , 8 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL8x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 - vbroadcastss -4 
* SIZE(BO, BI, SIZE), %ymm2 - vmulps %ymm2 , %ymm0 , %ymm12 - vaddps %ymm12, %ymm4 , %ymm4 - addq $ 1 , BI - addq $ 8 , %rax -.endm - -.macro SAVE8x1 - - vbroadcastss ALPHA, %ymm0 - - vmulps %ymm0 , %ymm4 , %ymm4 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %ymm4,%ymm4 - -#endif - - vmovups %ymm4 , (CO1) - -.endm - - - -/*******************************************************************************************/ - -.macro KERNEL4x1_SUB - vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 - vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmulps %xmm2 , %xmm0 , %xmm12 - vaddps %xmm12, %xmm4 , %xmm4 - addq $ 1 , BI - addq $ 4 , %rax -.endm - -.macro SAVE4x1 - - vbroadcastss ALPHA, %xmm0 - - vmulps %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddps (CO1), %xmm4,%xmm4 - -#endif - - vmovups %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL2x1_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmulss %xmm2 , %xmm0 , %xmm12 - vmulss %xmm2 , %xmm1 , %xmm13 - vaddss %xmm12, %xmm4 , %xmm4 - vaddss %xmm13, %xmm5 , %xmm5 - addq $ 1 , BI - addq $ 2 , %rax -.endm - -.macro SAVE2x1 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - vmulss %xmm0 , %xmm5 , %xmm5 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - vaddss 1 * SIZE(CO1), %xmm5,%xmm5 - -#endif - - vmovss %xmm4 , (CO1) - vmovss %xmm5 , 1 * SIZE(CO1) - -.endm - - -/*******************************************************************************************/ - -.macro KERNEL1x1_SUB - vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 - vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 - vmulss %xmm2 , %xmm0 , %xmm12 - vaddss %xmm12, %xmm4 , %xmm4 - addq $ 1 , BI - addq $ 1 , %rax -.endm - -.macro SAVE1x1 - - vmovss ALPHA, %xmm0 - - vmulss %xmm0 , %xmm4 , %xmm4 - -#if !defined(TRMMKERNEL) - - vaddss (CO1), %xmm4,%xmm4 - -#endif - - vmovss %xmm4 , (CO1) - -.endm - - -/*******************************************************************************************/ - -/************************************************************************************* -* TRMM Kernel -*************************************************************************************/ - - - PROLOGUE - PROFCODE - - subq $STACKSIZE, %rsp - movq %rbx, (%rsp) - movq %rbp, 8(%rsp) - movq %r12, 16(%rsp) - movq %r13, 24(%rsp) - movq %r14, 32(%rsp) - movq %r15, 40(%rsp) - - vzeroupper - -#ifdef WINDOWS_ABI - movq %rdi, 48(%rsp) - movq %rsi, 56(%rsp) - movups %xmm6, 64(%rsp) - movups %xmm7, 80(%rsp) - movups %xmm8, 96(%rsp) - movups %xmm9, 112(%rsp) - movups %xmm10, 128(%rsp) - movups %xmm11, 144(%rsp) - movups %xmm12, 160(%rsp) - movups %xmm13, 176(%rsp) - movups %xmm14, 192(%rsp) - movups %xmm15, 208(%rsp) - - movq ARG1, OLD_M - movq ARG2, OLD_N - movq ARG3, OLD_K - movq OLD_A, A - movq OLD_B, B - movq OLD_C, C - movq OLD_LDC, LDC -#ifdef TRMMKERNEL - vmovsd OLD_OFFSET, %xmm12 -#endif - vmovaps %xmm3, %xmm0 - -#else - movq STACKSIZE + 8(%rsp), LDC -#ifdef TRMMKERNEL - movsd STACKSIZE + 16(%rsp), %xmm12 -#endif - -#endif - - movq %rsp, SP # save old stack - subq $128 + L_BUFFER_SIZE, %rsp - andq $-4096, %rsp # align stack - - STACK_TOUCH - - cmpq $0, OLD_M - je .L999 - - cmpq $0, OLD_N - je .L999 - - cmpq $0, OLD_K - je .L999 - - movq OLD_M, M - movq OLD_N, N - movq OLD_K, K - - vmovss %xmm0, ALPHA - - salq $BASE_SHIFT, LDC - - movq N, %rax - xorq %rdx, %rdx - movq $4, %rdi - divq %rdi // N / 4 - movq %rax, Ndiv6 // N / 4 - movq 
%rdx, Nmod6 // N % 4 - - - -#ifdef TRMMKERNEL - vmovsd %xmm12, OFFSET - vmovsd %xmm12, KK -#ifndef LEFT - negq KK -#endif -#endif - - movq Ndiv6, J - cmpq $0, J - je .L2_0 - ALIGN_4 - -/*******************************************************************************************/ - -.L4_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L4_01b - ALIGN_4 - - -.L4_01a: - prefetcht0 512(BO1) - prefetchw 512(BO) - - vmovups (BO1), %xmm0 - vmovups 4*SIZE(BO1), %xmm1 - vmovups 8*SIZE(BO1), %xmm2 - vmovups 12*SIZE(BO1), %xmm3 - - vmovups %xmm0, (BO) - vmovups %xmm1, 4*SIZE(BO) - vmovups %xmm2, 8*SIZE(BO) - vmovups %xmm3,12*SIZE(BO) - - addq $ 16*SIZE,BO1 - addq $ 16*SIZE,BO - decq %rax - jnz .L4_01a - - -.L4_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L4_02d - ALIGN_4 - -.L4_02c: - - vmovups (BO1), %xmm0 - vmovups %xmm0, (BO) - addq $ 4*SIZE,BO1 - addq $ 4*SIZE,BO - decq %rax - jnz .L4_02c - -.L4_02d: - - movq BO1, B // next offset of B - -.L4_10: - movq C, CO1 - leaq (C, LDC, 2), CO2 - leaq (C, LDC, 4), C // c += 4 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $ 16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L4_20 - - ALIGN_4 - -.L4_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L4_16 - movq %rax, BI // Index for BO - leaq (,BI,4) , BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_12: - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - prefetcht0 A_PR1(AO, %rax, SIZE) - prefetcht0 B_PR1(BO, BI , SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - prefetcht0 A_PR1(AO, %rax, SIZE) - KERNEL16x4_SUB - - je .L4_16 - - jmp .L4_12 - ALIGN_4 - -.L4_16: -#ifndef TRMMKERNEL - movq K, %rax 
-#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_19 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_17: - - KERNEL16x4_SUB - - jl .L4_17 - ALIGN_4 - - -.L4_19: - - SAVE16x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - addq $16 * SIZE, CO2 # coffset += 16 - decq I # i -- - jg .L4_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L4_20: - // Test rest of M - - testq $15, M - jz .L4_60 // to next 3 lines of N - - testq $8, M - jz .L4_21pre - ALIGN_4 - -/**************************************************************************/ - -.L4_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_20_6 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_2: - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - KERNEL8x4_SUB - - je .L4_20_6 - - jmp .L4_20_2 - ALIGN_4 - -.L4_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_20_9 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_20_7: - - KERNEL8x4_SUB - - jl .L4_20_7 - ALIGN_4 - - -.L4_20_9: - - SAVE8x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO 
-#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - addq $8 * SIZE, CO2 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L4_21pre: - - testq $4, M - jz .L4_30 - ALIGN_4 - -.L4_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_26 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_22: - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - je .L4_26 - - jmp .L4_22 - ALIGN_4 - -.L4_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_29 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_27: - - KERNEL4x4_SUB - - jl .L4_27 - ALIGN_4 - - -.L4_29: - - SAVE4x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - addq $4 * SIZE, CO2 # coffset += 4 - ALIGN_4 - - -.L4_30: - testq $2, M - jz .L4_40 - - ALIGN_4 - -.L4_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $4, %rax // number of values 
in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L4_36 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_32: - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - KERNEL2x4_SUB - - je .L4_36 - - jmp .L4_32 - ALIGN_4 - -.L4_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_39 - - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_37: - - KERNEL2x4_SUB - - jl .L4_37 - ALIGN_4 - - -.L4_39: - - SAVE2x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - addq $2 * SIZE, CO2 # coffset += 2 - ALIGN_4 - -.L4_40: - testq $1, M - jz .L4_60 // to next 4 lines of N - - ALIGN_4 - -.L4_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $4, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L4_46 - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_42: - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - KERNEL1x4_SUB - - je .L4_46 - - jmp .L4_42 - ALIGN_4 - -.L4_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L4_49 - - movq %rax, BI // Index for BO - leaq (,BI,4), BI // BI = BI * 4 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L4_47: - - KERNEL1x4_SUB - - jl .L4_47 - ALIGN_4 - - -.L4_49: - - SAVE1x4 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - 
leaq (,BI, 4), BI // BI = BI * 4 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - addq $1 * SIZE, CO2 # coffset += 1 - ALIGN_4 - - - - - -.L4_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $4, KK -#endif - - decq J // j -- - jg .L4_01 // next 4 lines of N - - - -/*******************************************************************************************/ -.L2_0: - - movq Nmod6, J - andq $3, J // j % 4 - je .L999 - - movq Nmod6, J - andq $2, J // j % 4 - je .L1_0 - -.L2_01: - - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - sarq $2, %rax // K / 4 - jz .L2_01b - ALIGN_4 - -.L2_01a: - - vmovsd (BO1), %xmm0 - vmovsd 2*SIZE(BO1), %xmm1 - vmovsd 4*SIZE(BO1), %xmm2 - vmovsd 6*SIZE(BO1), %xmm3 - - vmovsd %xmm0, (BO) - vmovsd %xmm1, 2*SIZE(BO) - vmovsd %xmm2, 4*SIZE(BO) - vmovsd %xmm3, 6*SIZE(BO) - - addq $8*SIZE,BO1 - addq $8*SIZE,BO - decq %rax - jnz .L2_01a - - -.L2_01b: - - movq K, %rax - andq $3, %rax // K % 4 - jz .L2_02d - ALIGN_4 - -.L2_02c: - - vmovsd (BO1), %xmm0 - vmovsd %xmm0, (BO) - addq $2*SIZE,BO1 - addq $2*SIZE,BO - decq %rax - jnz .L2_02c - -.L2_02d: - - movq BO1, B // next offset of B - -.L2_10: - movq C, CO1 - leaq (C, LDC, 2), C // c += 2 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L2_20 - - ALIGN_4 - -.L2_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L2_16 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_12: - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - KERNEL16x2_SUB - - je .L2_16 - - jmp .L2_12 - ALIGN_4 - -.L2_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_19 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_17: - - KERNEL16x2_SUB - - jl .L2_17 - ALIGN_4 - - -.L2_19: - - SAVE16x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L2_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L2_20: - // Test rest of M - - testq $15, M - jz .L2_60 // to next 2 lines of N - - testq $8, M - jz .L2_21pre - ALIGN_4 - -/**************************************************************************/ - -.L2_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_20_6 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_2: - - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - KERNEL8x2_SUB - - je .L2_20_6 - - jmp .L2_20_2 - ALIGN_4 - -.L2_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_20_9 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_20_7: - - KERNEL8x2_SUB - - jl .L2_20_7 - ALIGN_4 - - -.L2_20_9: - - SAVE8x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L2_21pre: - - testq $4, M - jz .L2_30 - ALIGN_4 - -.L2_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first 
buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_26 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 1 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_22: - - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - KERNEL4x2_SUB - - je .L2_26 - - jmp .L2_22 - ALIGN_4 - -.L2_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_29 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_27: - - KERNEL4x2_SUB - - jl .L2_27 - ALIGN_4 - - -.L2_29: - - SAVE4x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L2_30: - testq $2, M - jz .L2_40 - - ALIGN_4 - -.L2_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L2_36 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_32: - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - 
- KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - KERNEL2x2_SUB - - je .L2_36 - - jmp .L2_32 - ALIGN_4 - -.L2_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_39 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_37: - - KERNEL2x2_SUB - - jl .L2_37 - ALIGN_4 - - -.L2_39: - - SAVE2x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L2_40: - testq $1, M - jz .L2_60 // to next 2 lines of N - - ALIGN_4 - -.L2_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $2, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L2_46 - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_42: - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - KERNEL1x2_SUB - - je .L2_46 - - jmp .L2_42 - ALIGN_4 - -.L2_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L2_49 - - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L2_47: - - KERNEL1x2_SUB - - jl .L2_47 - ALIGN_4 - - -.L2_49: - - SAVE1x2 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BI,BI,1), BI // BI = BI * 2 ; number of values - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - - - - -.L2_60: -#if defined(TRMMKERNEL) && !defined(LEFT) - addq $2, KK -#endif - - - - -.L1_0: - -/************************************************************************************************ -* Loop for Nmod6 % 2 > 0 
-*************************************************************************************************/ - - movq Nmod6, J - andq $1, J // j % 2 - je .L999 - ALIGN_4 - -.L1_01: - // copy to sub buffer - movq B, BO1 - leaq BUFFER1, BO // first buffer to BO - movq K, %rax - ALIGN_4 - -.L1_02b: - - vmovss (BO1), %xmm0 - vmovss %xmm0, (BO) - addq $1*SIZE,BO1 - addq $1*SIZE,BO - decq %rax - jnz .L1_02b - -.L1_02c: - - movq BO1, B // next offset of B - -.L1_10: - movq C, CO1 - leaq (C, LDC, 1), C // c += 1 * ldc - -#if defined(TRMMKERNEL) && defined(LEFT) - movq OFFSET, %rax - movq %rax, KK -#endif - - movq A, AO // aoffset = a - addq $16 * SIZE, AO - - movq M, I - sarq $4, I // i = (m >> 4) - je .L1_20 - - ALIGN_4 - -.L1_11: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $16, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - andq $-8, %rax // K = K - ( K % 8 ) - je .L1_16 - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_12: - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - KERNEL16x1_SUB - - je .L1_16 - - jmp .L1_12 - ALIGN_4 - -.L1_16: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_19 - - movq %rax, BI // Index for BO - - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_17: - - KERNEL16x1_SUB - - jl .L1_17 - ALIGN_4 - - -.L1_19: - - SAVE16x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $4, %rax // rax = rax * 16 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $16, KK -#endif - - addq $16 * SIZE, CO1 # coffset += 16 - decq I # i -- - jg .L1_11 - ALIGN_4 - -/************************************************************************** -* Rest of M -***************************************************************************/ -.L1_20: - // Test rest of M - - testq $15, M - jz .L999 - - testq $8, M - jz .L1_21pre - ALIGN_4 - -/**************************************************************************/ - -.L1_20_1: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // 
first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $8, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_20_6 - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_2: - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - KERNEL8x1_SUB - - je .L1_20_6 - - jmp .L1_20_2 - ALIGN_4 - -.L1_20_6: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_20_9 - - movq %rax, BI // Index for BO - - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_20_7: - - KERNEL8x1_SUB - - jl .L1_20_7 - ALIGN_4 - - -.L1_20_9: - - SAVE8x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $3, %rax // rax = rax * 8 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $8, KK -#endif - - addq $8 * SIZE, CO1 # coffset += 8 - ALIGN_4 - - - -/**************************************************************************/ - -.L1_21pre: - - testq $4, M - jz .L1_30 - ALIGN_4 - -.L1_21: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $4, %rax // number of values in A -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_26 - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_22: - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - KERNEL4x1_SUB - - je .L1_26 - - jmp .L1_22 - ALIGN_4 - -.L1_26: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_29 - - movq %rax, BI // Index for BO - - salq $2, %rax // rax = rax * 4 ; number of 
values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_27: - - KERNEL4x1_SUB - - jl .L1_27 - ALIGN_4 - - -.L1_29: - - SAVE4x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $2, %rax // rax = rax * 4 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $4, KK -#endif - - addq $4 * SIZE, CO1 # coffset += 4 - ALIGN_4 - - -.L1_30: - testq $2, M - jz .L1_40 - - ALIGN_4 - -.L1_31: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $2, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO -#endif - movq %rax, KKK -#endif - - - andq $-8, %rax - je .L1_36 - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_32: - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - KERNEL2x1_SUB - - je .L1_36 - - jmp .L1_32 - ALIGN_4 - -.L1_36: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_39 - - movq %rax, BI // Index for BO - - salq $1, %rax // rax = rax *2 ; number of values - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_37: - - KERNEL2x1_SUB - - jl .L1_37 - ALIGN_4 - - -.L1_39: - - SAVE2x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - salq $1, %rax // rax = rax * 2 ; number of values - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $2, KK -#endif - - addq $2 * SIZE, CO1 # coffset += 2 - ALIGN_4 - -.L1_40: - testq $1, M - jz .L999 - - ALIGN_4 - -.L1_41: -#if !defined(TRMMKERNEL) || \ - (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO -#else - movq KK, %rax - leaq BUFFER1, BO // first buffer to BO - addq $4 * SIZE, BO - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - - vzeroall - -#ifndef TRMMKERNEL - movq K, %rax -#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - movq K, %rax - subq KK, %rax - movq %rax, KKK -#else - movq KK, %rax -#ifdef LEFT - addq $1, %rax // number of values in AO -#else - addq $1, %rax // number of values in BO 
-#endif - movq %rax, KKK -#endif - - andq $-8, %rax - je .L1_46 - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_42: - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - KERNEL1x1_SUB - - je .L1_46 - - jmp .L1_42 - ALIGN_4 - -.L1_46: -#ifndef TRMMKERNEL - movq K, %rax -#else - movq KKK, %rax -#endif - - andq $7, %rax # if (k & 1) - je .L1_49 - - movq %rax, BI // Index for BO - - leaq (AO, %rax, SIZE), AO - leaq (BO, BI, SIZE), BO - negq BI - negq %rax - ALIGN_4 - -.L1_47: - - KERNEL1x1_SUB - - jl .L1_47 - ALIGN_4 - - -.L1_49: - - SAVE1x1 - -#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ - (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) - movq K, %rax - subq KKK, %rax - movq %rax, BI // Index for BO - leaq (BO, BI, SIZE), BO - leaq (AO, %rax, SIZE), AO -#endif - - -#if defined(TRMMKERNEL) && defined(LEFT) - addq $1, KK -#endif - - addq $1 * SIZE, CO1 # coffset += 1 - ALIGN_4 - - -.L999: - movq SP, %rsp - movq (%rsp), %rbx - movq 8(%rsp), %rbp - movq 16(%rsp), %r12 - movq 24(%rsp), %r13 - movq 32(%rsp), %r14 - movq 40(%rsp), %r15 - -#ifdef WINDOWS_ABI - movq 48(%rsp), %rdi - movq 56(%rsp), %rsi - movups 64(%rsp), %xmm6 - movups 80(%rsp), %xmm7 - movups 96(%rsp), %xmm8 - movups 112(%rsp), %xmm9 - movups 128(%rsp), %xmm10 - movups 144(%rsp), %xmm11 - movups 160(%rsp), %xmm12 - movups 176(%rsp), %xmm13 - movups 192(%rsp), %xmm14 - movups 208(%rsp), %xmm15 -#endif - - addq $STACKSIZE, %rsp - ret - - EPILOGUE - - - - - +/********************************************************************************* +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define J %r14 +#define OLD_K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define K %r12 +#define BI %rbp +#define SP %rbx + +#define BO1 %rdi +#define CO2 %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define L_BUFFER_SIZE 8192 + +#define Ndiv6 24(%rsp) +#define Nmod6 32(%rsp) +#define N 40(%rsp) +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER1 128(%rsp) + +#if defined(OS_WINDOWS) +#if L_BUFFER_SIZE > 16384 +#define STACK_TOUCH \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 12288 +#define STACK_TOUCH \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 8192 +#define STACK_TOUCH \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif L_BUFFER_SIZE > 4096 +#define STACK_TOUCH \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCH +#endif +#else +#define STACK_TOUCH +#endif + +#define A_PR1 512 +#define B_PR1 512 + +/******************************************************************************************* +* 4 lines of N +*******************************************************************************************/ + +.macro KERNEL16x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm2 , %ymm1 , %ymm13 + vmulps %ymm3 , %ymm0 , %ymm14 + vmulps %ymm3 , %ymm1 , %ymm15 + vaddps %ymm12, %ymm4 , %ymm4 + vaddps %ymm13, %ymm5 , %ymm5 + vaddps %ymm14, %ymm6 , %ymm6 + vaddps %ymm15, %ymm7 , %ymm7 + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm2 , %ymm1 , %ymm13 + vmulps %ymm3 , %ymm0 , %ymm14 + vmulps %ymm3 , %ymm1 , %ymm15 + vaddps %ymm12, %ymm8 , %ymm8 + vaddps %ymm13, %ymm9 , %ymm9 + vaddps %ymm14, %ymm10, %ymm10 + vaddps %ymm15, %ymm11, %ymm11 + addq $ 4 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm9 , %ymm9 + vmulps %ymm0 , %ymm10, %ymm10 + vmulps %ymm0 , %ymm11, %ymm11 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + + vaddps (CO2), %ymm8,%ymm8 + vaddps 8 * SIZE(CO2), %ymm9,%ymm9 + + vaddps (CO2, LDC), %ymm10,%ymm10 + vaddps 8 * SIZE(CO2, LDC), %ymm11,%ymm11 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + + vmovups %ymm8 , (CO2) + vmovups %ymm9 , 8 * SIZE(CO2) + + vmovups %ymm10, (CO2, LDC) + vmovups %ymm11, 8 * SIZE(CO2, LDC) + + prefetcht0 64(CO1) + prefetcht0 64(CO1, LDC) + prefetcht0 
64(CO2) + prefetcht0 64(CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm3 , %ymm0 , %ymm14 + vaddps %ymm12, %ymm4 , %ymm4 + vaddps %ymm14, %ymm6 , %ymm6 + vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm3 , %ymm0 , %ymm14 + vaddps %ymm12, %ymm8 , %ymm8 + vaddps %ymm14, %ymm10, %ymm10 + addq $ 4 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x4 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm8 , %ymm8 + vmulps %ymm0 , %ymm10, %ymm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps (CO2), %ymm8,%ymm8 + vaddps (CO2, LDC), %ymm10,%ymm10 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm8 , (CO2) + vmovups %ymm10, (CO2, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x4_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + vmulps %xmm2 , %xmm0 , %xmm12 + vmulps %xmm3 , %xmm0 , %xmm14 + vaddps %xmm12, %xmm4 , %xmm4 + vaddps %xmm14, %xmm6 , %xmm6 + vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 + vmulps %xmm2 , %xmm0 , %xmm12 + vmulps %xmm3 , %xmm0 , %xmm14 + vaddps %xmm12, %xmm8 , %xmm8 + vaddps %xmm14, %xmm10, %xmm10 + addq $ 4 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x4 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + vmulps %xmm0 , %xmm8 , %xmm8 + vmulps %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + vaddps (CO2), %xmm8,%xmm8 + vaddps (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + vmovups %xmm8 , (CO2) + vmovups %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm2 , %xmm1 , %xmm13 + vmulss %xmm3 , %xmm0 , %xmm14 + vmulss %xmm3 , %xmm1 , %xmm15 + vaddss %xmm12, %xmm4 , %xmm4 + vaddss %xmm13, %xmm5 , %xmm5 + vaddss %xmm14, %xmm6 , %xmm6 + vaddss %xmm15, %xmm7 , %xmm7 + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm2 , %xmm1 , %xmm13 + vmulss %xmm3 , %xmm0 , %xmm14 + vmulss %xmm3 , %xmm1 , %xmm15 + vaddss %xmm12, %xmm8 , %xmm8 + vaddss %xmm13, %xmm9 , %xmm9 + vaddss %xmm14, %xmm10, %xmm10 + vaddss %xmm15, %xmm11, %xmm11 + addq $ 4 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm9 , %xmm9 + vmulss %xmm0 , %xmm10, %xmm10 + vmulss %xmm0 , %xmm11, %xmm11 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), 
%xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + + vaddss (CO2), %xmm8,%xmm8 + vaddss 1 * SIZE(CO2), %xmm9,%xmm9 + + vaddss (CO2, LDC), %xmm10,%xmm10 + vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + + vmovss %xmm8 , (CO2) + vmovss %xmm9 , 1 * SIZE(CO2) + + vmovss %xmm10, (CO2, LDC) + vmovss %xmm11, 1 * SIZE(CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x4_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm3 , %xmm0 , %xmm14 + vaddss %xmm12, %xmm4 , %xmm4 + vaddss %xmm14, %xmm6 , %xmm6 + vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm3 , %xmm0 , %xmm14 + vaddss %xmm12, %xmm8 , %xmm8 + vaddss %xmm14, %xmm10, %xmm10 + addq $ 4 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x4 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm8 , %xmm8 + vmulss %xmm0 , %xmm10, %xmm10 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss (CO2), %xmm8,%xmm8 + vaddss (CO2, LDC), %xmm10,%xmm10 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm8 , (CO2) + vmovss %xmm10, (CO2, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 2 lines of N +*******************************************************************************************/ + +.macro KERNEL16x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm2 , %ymm1 , %ymm13 + vmulps %ymm3 , %ymm0 , %ymm14 + vmulps %ymm3 , %ymm1 , %ymm15 + vaddps %ymm12, %ymm4 , %ymm4 + vaddps %ymm13, %ymm5 , %ymm5 + vaddps %ymm14, %ymm6 , %ymm6 + vaddps %ymm15, %ymm7 , %ymm7 + addq $ 2 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + vmulps %ymm0 , %ymm6 , %ymm6 + vmulps %ymm0 , %ymm7 , %ymm7 + + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + + vaddps (CO1, LDC), %ymm6,%ymm6 + vaddps 8 * SIZE(CO1, LDC), %ymm7,%ymm7 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + + vmovups %ymm6 , (CO1, LDC) + vmovups %ymm7 , 8 * SIZE(CO1, LDC) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL8x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm3 , %ymm0 , %ymm14 + vaddps %ymm12, %ymm4 , %ymm4 + vaddps %ymm14, %ymm6 , %ymm6 + addq $ 2 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x2 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm6 , %ymm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps (CO1, LDC), %ymm6,%ymm6 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm6 , (CO1, LDC) 
+ +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x2_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3 + vmulps %xmm2 , %xmm0 , %xmm12 + vmulps %xmm3 , %xmm0 , %xmm14 + vaddps %xmm12, %xmm4 , %xmm4 + vaddps %xmm14, %xmm6 , %xmm6 + addq $ 2 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x2 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + vmulps %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + vaddps (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovups %xmm4 , (CO1) + vmovups %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm2 , %xmm1 , %xmm13 + vmulss %xmm3 , %xmm0 , %xmm14 + vmulss %xmm3 , %xmm1 , %xmm15 + vaddss %xmm12, %xmm4 , %xmm4 + vaddss %xmm13, %xmm5 , %xmm5 + vaddss %xmm14, %xmm6 , %xmm6 + vaddss %xmm15, %xmm7 , %xmm7 + addq $ 2 , BI + addq $ 2, %rax +.endm + +.macro SAVE2x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + vmulss %xmm0 , %xmm6 , %xmm6 + vmulss %xmm0 , %xmm7 , %xmm7 + + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + + vaddss (CO1, LDC), %xmm6,%xmm6 + vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + + vmovss %xmm6 , (CO1, LDC) + vmovss %xmm7 , 1 * SIZE(CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x2_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmovss -3 * SIZE(BO, BI, SIZE), %xmm3 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm3 , %xmm0 , %xmm14 + vaddss %xmm12, %xmm4 , %xmm4 + vaddss %xmm14, %xmm6 , %xmm6 + addq $ 2 , BI + addq $ 1, %rax +.endm + +.macro SAVE1x2 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm6 , %xmm6 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss (CO1, LDC), %xmm6,%xmm6 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm6 , (CO1, LDC) + +.endm + + +/*******************************************************************************************/ + +/******************************************************************************************* +* 1 line of N +*******************************************************************************************/ + +.macro KERNEL16x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2 + vmulps %ymm2 , %ymm0 , %ymm12 + vmulps %ymm2 , %ymm1 , %ymm13 + vaddps %ymm12, %ymm4 , %ymm4 + vaddps %ymm13, %ymm5 , %ymm5 + addq $ 1 , BI + addq $ 16, %rax +.endm + +.macro SAVE16x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + vmulps %ymm0 , %ymm5 , %ymm5 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + vaddps 8 * SIZE(CO1), %ymm5,%ymm5 + +#endif + + vmovups %ymm4 , (CO1) + vmovups %ymm5 , 8 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL8x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 + vbroadcastss -4 
* SIZE(BO, BI, SIZE), %ymm2 + vmulps %ymm2 , %ymm0 , %ymm12 + vaddps %ymm12, %ymm4 , %ymm4 + addq $ 1 , BI + addq $ 8 , %rax +.endm + +.macro SAVE8x1 + + vbroadcastss ALPHA, %ymm0 + + vmulps %ymm0 , %ymm4 , %ymm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %ymm4,%ymm4 + +#endif + + vmovups %ymm4 , (CO1) + +.endm + + + +/*******************************************************************************************/ + +.macro KERNEL4x1_SUB + vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 + vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmulps %xmm2 , %xmm0 , %xmm12 + vaddps %xmm12, %xmm4 , %xmm4 + addq $ 1 , BI + addq $ 4 , %rax +.endm + +.macro SAVE4x1 + + vbroadcastss ALPHA, %xmm0 + + vmulps %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddps (CO1), %xmm4,%xmm4 + +#endif + + vmovups %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL2x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmulss %xmm2 , %xmm0 , %xmm12 + vmulss %xmm2 , %xmm1 , %xmm13 + vaddss %xmm12, %xmm4 , %xmm4 + vaddss %xmm13, %xmm5 , %xmm5 + addq $ 1 , BI + addq $ 2 , %rax +.endm + +.macro SAVE2x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + vmulss %xmm0 , %xmm5 , %xmm5 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + vaddss 1 * SIZE(CO1), %xmm5,%xmm5 + +#endif + + vmovss %xmm4 , (CO1) + vmovss %xmm5 , 1 * SIZE(CO1) + +.endm + + +/*******************************************************************************************/ + +.macro KERNEL1x1_SUB + vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0 + vmovss -4 * SIZE(BO, BI, SIZE), %xmm2 + vmulss %xmm2 , %xmm0 , %xmm12 + vaddss %xmm12, %xmm4 , %xmm4 + addq $ 1 , BI + addq $ 1 , %rax +.endm + +.macro SAVE1x1 + + vmovss ALPHA, %xmm0 + + vmulss %xmm0 , %xmm4 , %xmm4 + +#if !defined(TRMMKERNEL) + + vaddss (CO1), %xmm4,%xmm4 + +#endif + + vmovss %xmm4 , (CO1) + +.endm + + +/*******************************************************************************************/ + +/************************************************************************************* +* TRMM Kernel +*************************************************************************************/ + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + vmovsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, SP # save old stack + subq $128 + L_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCH + + cmpq $0, OLD_M + je .L999 + + cmpq $0, OLD_N + je .L999 + + cmpq $0, OLD_K + je .L999 + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + vmovss %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + + movq N, %rax + xorq %rdx, %rdx + movq $4, %rdi + divq %rdi // N / 4 + movq %rax, Ndiv6 // N / 4 + movq 
%rdx, Nmod6 // N % 4 + + + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq Ndiv6, J + cmpq $0, J + je .L2_0 + ALIGN_4 + +/*******************************************************************************************/ + +.L4_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L4_01b + ALIGN_4 + + +.L4_01a: + prefetcht0 512(BO1) + prefetchw 512(BO) + + vmovups (BO1), %xmm0 + vmovups 4*SIZE(BO1), %xmm1 + vmovups 8*SIZE(BO1), %xmm2 + vmovups 12*SIZE(BO1), %xmm3 + + vmovups %xmm0, (BO) + vmovups %xmm1, 4*SIZE(BO) + vmovups %xmm2, 8*SIZE(BO) + vmovups %xmm3,12*SIZE(BO) + + addq $ 16*SIZE,BO1 + addq $ 16*SIZE,BO + decq %rax + jnz .L4_01a + + +.L4_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L4_02d + ALIGN_4 + +.L4_02c: + + vmovups (BO1), %xmm0 + vmovups %xmm0, (BO) + addq $ 4*SIZE,BO1 + addq $ 4*SIZE,BO + decq %rax + jnz .L4_02c + +.L4_02d: + + movq BO1, B // next offset of B + +.L4_10: + movq C, CO1 + leaq (C, LDC, 2), CO2 + leaq (C, LDC, 4), C // c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $ 16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L4_20 + + ALIGN_4 + +.L4_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L4_16 + movq %rax, BI // Index for BO + leaq (,BI,4) , BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_12: + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + prefetcht0 A_PR1(AO, %rax, SIZE) + prefetcht0 B_PR1(BO, BI , SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + prefetcht0 A_PR1(AO, %rax, SIZE) + KERNEL16x4_SUB + + je .L4_16 + + jmp .L4_12 + ALIGN_4 + +.L4_16: +#ifndef TRMMKERNEL + movq K, %rax 
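+// plain GEMM build: the tail-iteration count below is taken from the full K; the TRMM build uses the clipped KKK instead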
+#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_19 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_17: + + KERNEL16x4_SUB + + jl .L4_17 + ALIGN_4 + + +.L4_19: + + SAVE16x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + addq $16 * SIZE, CO2 # coffset += 16 + decq I # i -- + jg .L4_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L4_20: + // Test rest of M + + testq $15, M + jz .L4_60 // to next 3 lines of N + + testq $8, M + jz .L4_21pre + ALIGN_4 + +/**************************************************************************/ + +.L4_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_20_6 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_2: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + je .L4_20_6 + + jmp .L4_20_2 + ALIGN_4 + +.L4_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_20_9 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_20_7: + + KERNEL8x4_SUB + + jl .L4_20_7 + ALIGN_4 + + +.L4_20_9: + + SAVE8x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO 
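+// TRMM build only: advance AO and BO past the untouched remainder of K so both point at the start of the next panel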
+#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + addq $8 * SIZE, CO2 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L4_21pre: + + testq $4, M + jz .L4_30 + ALIGN_4 + +.L4_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_26 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + je .L4_26 + + jmp .L4_22 + ALIGN_4 + +.L4_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_29 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_27: + + KERNEL4x4_SUB + + jl .L4_27 + ALIGN_4 + + +.L4_29: + + SAVE4x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + + +.L4_30: + testq $2, M + jz .L4_40 + + ALIGN_4 + +.L4_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $4, %rax // number of values 
in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L4_36 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_32: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + je .L4_36 + + jmp .L4_32 + ALIGN_4 + +.L4_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_39 + + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_37: + + KERNEL2x4_SUB + + jl .L4_37 + ALIGN_4 + + +.L4_39: + + SAVE2x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L4_40: + testq $1, M + jz .L4_60 // to next 4 lines of N + + ALIGN_4 + +.L4_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $4, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L4_46 + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_42: + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + je .L4_46 + + jmp .L4_42 + ALIGN_4 + +.L4_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L4_49 + + movq %rax, BI // Index for BO + leaq (,BI,4), BI // BI = BI * 4 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L4_47: + + KERNEL1x4_SUB + + jl .L4_47 + ALIGN_4 + + +.L4_49: + + SAVE1x4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + 
leaq (,BI, 4), BI // BI = BI * 4 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + addq $1 * SIZE, CO2 # coffset += 1 + ALIGN_4 + + + + + +.L4_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + decq J // j -- + jg .L4_01 // next 4 lines of N + + + +/*******************************************************************************************/ +.L2_0: + + movq Nmod6, J + andq $3, J // j % 4 + je .L999 + + movq Nmod6, J + andq $2, J // j % 4 + je .L1_0 + +.L2_01: + + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + sarq $2, %rax // K / 4 + jz .L2_01b + ALIGN_4 + +.L2_01a: + + vmovsd (BO1), %xmm0 + vmovsd 2*SIZE(BO1), %xmm1 + vmovsd 4*SIZE(BO1), %xmm2 + vmovsd 6*SIZE(BO1), %xmm3 + + vmovsd %xmm0, (BO) + vmovsd %xmm1, 2*SIZE(BO) + vmovsd %xmm2, 4*SIZE(BO) + vmovsd %xmm3, 6*SIZE(BO) + + addq $8*SIZE,BO1 + addq $8*SIZE,BO + decq %rax + jnz .L2_01a + + +.L2_01b: + + movq K, %rax + andq $3, %rax // K % 4 + jz .L2_02d + ALIGN_4 + +.L2_02c: + + vmovsd (BO1), %xmm0 + vmovsd %xmm0, (BO) + addq $2*SIZE,BO1 + addq $2*SIZE,BO + decq %rax + jnz .L2_02c + +.L2_02d: + + movq BO1, B // next offset of B + +.L2_10: + movq C, CO1 + leaq (C, LDC, 2), C // c += 2 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L2_20 + + ALIGN_4 + +.L2_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L2_16 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_12: + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + je .L2_16 + + jmp .L2_12 + ALIGN_4 + +.L2_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_19 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_17: + + KERNEL16x2_SUB + + jl .L2_17 + ALIGN_4 + + +.L2_19: + + SAVE16x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L2_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L2_20: + // Test rest of M + + testq $15, M + jz .L2_60 // to next 2 lines of N + + testq $8, M + jz .L2_21pre + ALIGN_4 + +/**************************************************************************/ + +.L2_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_20_6 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_2: + + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + je .L2_20_6 + + jmp .L2_20_2 + ALIGN_4 + +.L2_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_20_9 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_20_7: + + KERNEL8x2_SUB + + jl .L2_20_7 + ALIGN_4 + + +.L2_20_9: + + SAVE8x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L2_21pre: + + testq $4, M + jz .L2_30 + ALIGN_4 + +.L2_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first 
buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_26 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 1 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_22: + + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + je .L2_26 + + jmp .L2_22 + ALIGN_4 + +.L2_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_29 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_27: + + KERNEL4x2_SUB + + jl .L2_27 + ALIGN_4 + + +.L2_29: + + SAVE4x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L2_30: + testq $2, M + jz .L2_40 + + ALIGN_4 + +.L2_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L2_36 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_32: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + 
+ KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + je .L2_36 + + jmp .L2_32 + ALIGN_4 + +.L2_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_39 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_37: + + KERNEL2x2_SUB + + jl .L2_37 + ALIGN_4 + + +.L2_39: + + SAVE2x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L2_40: + testq $1, M + jz .L2_60 // to next 2 lines of N + + ALIGN_4 + +.L2_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $2, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L2_46 + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_42: + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + je .L2_46 + + jmp .L2_42 + ALIGN_4 + +.L2_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L2_49 + + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L2_47: + + KERNEL1x2_SUB + + jl .L2_47 + ALIGN_4 + + +.L2_49: + + SAVE1x2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BI,BI,1), BI // BI = BI * 2 ; number of values + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + + + + +.L2_60: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + + + +.L1_0: + +/************************************************************************************************ +* Loop for Nmod6 % 2 > 0 
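+* (handles the single leftover column of C when N is odd)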
+*************************************************************************************************/ + + movq Nmod6, J + andq $1, J // j % 2 + je .L999 + ALIGN_4 + +.L1_01: + // copy to sub buffer + movq B, BO1 + leaq BUFFER1, BO // first buffer to BO + movq K, %rax + ALIGN_4 + +.L1_02b: + + vmovss (BO1), %xmm0 + vmovss %xmm0, (BO) + addq $1*SIZE,BO1 + addq $1*SIZE,BO + decq %rax + jnz .L1_02b + +.L1_02c: + + movq BO1, B // next offset of B + +.L1_10: + movq C, CO1 + leaq (C, LDC, 1), C // c += 1 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO // aoffset = a + addq $16 * SIZE, AO + + movq M, I + sarq $4, I // i = (m >> 4) + je .L1_20 + + ALIGN_4 + +.L1_11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $16, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + andq $-8, %rax // K = K - ( K % 8 ) + je .L1_16 + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_12: + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + je .L1_16 + + jmp .L1_12 + ALIGN_4 + +.L1_16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_19 + + movq %rax, BI // Index for BO + + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_17: + + KERNEL16x1_SUB + + jl .L1_17 + ALIGN_4 + + +.L1_19: + + SAVE16x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $4, %rax // rax = rax * 16 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $16, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 16 + decq I # i -- + jg .L1_11 + ALIGN_4 + +/************************************************************************** +* Rest of M +***************************************************************************/ +.L1_20: + // Test rest of M + + testq $15, M + jz .L999 + + testq $8, M + jz .L1_21pre + ALIGN_4 + +/**************************************************************************/ + +.L1_20_1: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // 
first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_20_6 + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_2: + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + je .L1_20_6 + + jmp .L1_20_2 + ALIGN_4 + +.L1_20_6: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_20_9 + + movq %rax, BI // Index for BO + + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_20_7: + + KERNEL8x1_SUB + + jl .L1_20_7 + ALIGN_4 + + +.L1_20_9: + + SAVE8x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $3, %rax // rax = rax * 8 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 8 + ALIGN_4 + + + +/**************************************************************************/ + +.L1_21pre: + + testq $4, M + jz .L1_30 + ALIGN_4 + +.L1_21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax // number of values in A +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_26 + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_22: + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + je .L1_26 + + jmp .L1_22 + ALIGN_4 + +.L1_26: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_29 + + movq %rax, BI // Index for BO + + salq $2, %rax // rax = rax * 4 ; number of 
values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_27: + + KERNEL4x1_SUB + + jl .L1_27 + ALIGN_4 + + +.L1_29: + + SAVE4x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $2, %rax // rax = rax * 4 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + + +.L1_30: + testq $2, M + jz .L1_40 + + ALIGN_4 + +.L1_31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO +#endif + movq %rax, KKK +#endif + + + andq $-8, %rax + je .L1_36 + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_32: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + je .L1_36 + + jmp .L1_32 + ALIGN_4 + +.L1_36: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_39 + + movq %rax, BI // Index for BO + + salq $1, %rax // rax = rax *2 ; number of values + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_37: + + KERNEL2x1_SUB + + jl .L1_37 + ALIGN_4 + + +.L1_39: + + SAVE2x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + salq $1, %rax // rax = rax * 2 ; number of values + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + ALIGN_4 + +.L1_40: + testq $1, M + jz .L999 + + ALIGN_4 + +.L1_41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO +#else + movq KK, %rax + leaq BUFFER1, BO // first buffer to BO + addq $4 * SIZE, BO + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + + vzeroall + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax // number of values in AO +#else + addq $1, %rax // number of values in BO 
+#endif + movq %rax, KKK +#endif + + andq $-8, %rax + je .L1_46 + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_42: + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + je .L1_46 + + jmp .L1_42 + ALIGN_4 + +.L1_46: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + andq $7, %rax # if (k & 1) + je .L1_49 + + movq %rax, BI // Index for BO + + leaq (AO, %rax, SIZE), AO + leaq (BO, BI, SIZE), BO + negq BI + negq %rax + ALIGN_4 + +.L1_47: + + KERNEL1x1_SUB + + jl .L1_47 + ALIGN_4 + + +.L1_49: + + SAVE1x1 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + movq %rax, BI // Index for BO + leaq (BO, BI, SIZE), BO + leaq (AO, %rax, SIZE), AO +#endif + + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO1 # coffset += 1 + ALIGN_4 + + +.L999: + movq SP, %rsp + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE + + + + + diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_RN.c b/kernel/x86_64/strsm_kernel_8x4_haswell_RN.c index 4e2cd4fe6..dbfcd55d7 100644 --- a/kernel/x86_64/strsm_kernel_8x4_haswell_RN.c +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_RN.c @@ -1,279 +1,279 @@ -#include "common.h" -#include -#include "strsm_kernel_8x4_haswell_R_common.h" - -#define SOLVE_RN_m8n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "movq %2,%3; addq $32,%2;"\ - SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1)\ - SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1)\ - SAVE_SOLUTION_m8n2(4,5,0)\ - SOLVE_leri_m8n2(40,6,7,%1)\ - SOLVE_ri_m8n2(56,6,7,%1)\ - SAVE_SOLUTION_m8n2(6,7,64) - -#define SOLVE_RN_m8n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "movq %2,%3; addq $32,%2;"\ - SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4)\ - SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4)\ - SAVE_SOLUTION_m8n2(4,5,0)\ - SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4)\ - SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4)\ - SAVE_SOLUTION_m8n2(6,7,64)\ - SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4)\ - SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4)\ - SAVE_SOLUTION_m8n2(8,9,128)\ - SOLVE_leri_m8n2(104,10,11,%1,%%r12,4)\ - SOLVE_ri_m8n2(120,10,11,%1,%%r12,4)\ - SAVE_SOLUTION_m8n2(10,11,192) - -#define SOLVE_RN_m8n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "movq %2,%3; addq $32,%2;"\ - SOLVE_leri_m8n2(0,4,5,%1) 
SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4) SUBTRACT_m8n2(0,12,13,%1,%%r12,8) SUBTRACT_m8n2(8,14,15,%1,%%r12,8)\ - SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4) SUBTRACT_m8n2(16,12,13,%1,%%r12,8) SUBTRACT_m8n2(24,14,15,%1,%%r12,8)\ - SAVE_SOLUTION_m8n2(4,5,0)\ - SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4) SUBTRACT_m8n2(32,12,13,%1,%%r12,8) SUBTRACT_m8n2(40,14,15,%1,%%r12,8)\ - SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4) SUBTRACT_m8n2(48,12,13,%1,%%r12,8) SUBTRACT_m8n2(56,14,15,%1,%%r12,8)\ - SAVE_SOLUTION_m8n2(6,7,64)\ - SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4) SUBTRACT_m8n2(64,12,13,%1,%%r12,8) SUBTRACT_m8n2(72,14,15,%1,%%r12,8)\ - SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4) SUBTRACT_m8n2(80,12,13,%1,%%r12,8) SUBTRACT_m8n2(88,14,15,%1,%%r12,8)\ - SAVE_SOLUTION_m8n2(8,9,128)\ - SOLVE_leri_m8n2(104,10,11,%1,%%r12,4) SUBTRACT_m8n2(96,12,13,%1,%%r12,8) SUBTRACT_m8n2(104,14,15,%1,%%r12,8)\ - SOLVE_ri_m8n2(120,10,11,%1,%%r12,4) SUBTRACT_m8n2(112,12,13,%1,%%r12,8) SUBTRACT_m8n2(120,14,15,%1,%%r12,8)\ - SAVE_SOLUTION_m8n2(10,11,192)\ - SOLVE_leri_m8n2(128,12,13,%1,%%r12,8) SUBTRACT_m8n2(136,14,15,%1,%%r12,8)\ - SOLVE_ri_m8n2(144,12,13,%1,%%r12,8) SUBTRACT_m8n2(152,14,15,%1,%%r12,8)\ - SAVE_SOLUTION_m8n2(12,13,256)\ - SOLVE_leri_m8n2(168,14,15,%1,%%r12,8)\ - SOLVE_ri_m8n2(184,14,15,%1,%%r12,8)\ - SAVE_SOLUTION_m8n2(14,15,320) - -#define SOLVE_RN_m4n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "movq %2,%3; addq $16,%2;"\ - SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1)\ - SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1)\ - SAVE_SOLUTION_m4n2(4,0)\ - SOLVE_leri_m4n2(40,5,%1)\ - SOLVE_ri_m4n2(56,5,%1)\ - SAVE_SOLUTION_m4n2(5,32) - -#define SOLVE_RN_m4n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "movq %2,%3; addq $16,%2;"\ - SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4)\ - SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4)\ - SAVE_SOLUTION_m4n2(4,0)\ - SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4)\ - SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4)\ - SAVE_SOLUTION_m4n2(5,32)\ - SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4)\ - SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4)\ - SAVE_SOLUTION_m4n2(6,64)\ - SOLVE_leri_m4n2(104,7,%1,%%r12,4)\ - SOLVE_ri_m4n2(120,7,%1,%%r12,4)\ - SAVE_SOLUTION_m4n2(7,96) - -#define SOLVE_RN_m4n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "movq %2,%3; addq $16,%2;"\ - SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4) SUBTRACT_m4n2(0,8,%1,%%r12,8) SUBTRACT_m4n2(8,9,%1,%%r12,8)\ - SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4) SUBTRACT_m4n2(16,8,%1,%%r12,8) SUBTRACT_m4n2(24,9,%1,%%r12,8)\ - SAVE_SOLUTION_m4n2(4,0)\ - SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4) SUBTRACT_m4n2(32,8,%1,%%r12,8) SUBTRACT_m4n2(40,9,%1,%%r12,8)\ - SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) 
SUBTRACT_m4n2(56,7,%1,%%r12,4) SUBTRACT_m4n2(48,8,%1,%%r12,8) SUBTRACT_m4n2(56,9,%1,%%r12,8)\ - SAVE_SOLUTION_m4n2(5,32)\ - SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4) SUBTRACT_m4n2(64,8,%1,%%r12,8) SUBTRACT_m4n2(72,9,%1,%%r12,8)\ - SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4) SUBTRACT_m4n2(80,8,%1,%%r12,8) SUBTRACT_m4n2(88,9,%1,%%r12,8)\ - SAVE_SOLUTION_m4n2(6,64)\ - SOLVE_leri_m4n2(104,7,%1,%%r12,4) SUBTRACT_m4n2(96,8,%1,%%r12,8) SUBTRACT_m4n2(104,9,%1,%%r12,8)\ - SOLVE_ri_m4n2(120,7,%1,%%r12,4) SUBTRACT_m4n2(112,8,%1,%%r12,8) SUBTRACT_m4n2(120,9,%1,%%r12,8)\ - SAVE_SOLUTION_m4n2(7,96)\ - SOLVE_leri_m4n2(128,8,%1,%%r12,8) SUBTRACT_m4n2(136,9,%1,%%r12,8)\ - SOLVE_ri_m4n2(144,8,%1,%%r12,8) SUBTRACT_m4n2(152,9,%1,%%r12,8)\ - SAVE_SOLUTION_m4n2(8,128)\ - SOLVE_leri_m4n2(168,9,%1,%%r12,8)\ - SOLVE_ri_m4n2(184,9,%1,%%r12,8)\ - SAVE_SOLUTION_m4n2(9,160) - -#define SOLVE_RN_m2n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "movq %2,%3; addq $8,%2;"\ - SOLVE_col1_ltor_m2n4(0,4,5,%1)\ - SOLVE_col2_ltor_m2n4(16,4,5,%1)\ - SOLVE_col3_ltor_m2n4(32,4,5,%1)\ - SOLVE_col4_ltor_m2n4(48,4,5,%1)\ - SAVE_SOLUTION_m2n4(4,5,0) - -#define SOLVE_RN_m2n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "movq %2,%3; addq $8,%2;"\ - SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4)\ - SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4)\ - SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4)\ - SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4)\ - SAVE_SOLUTION_m2n4(4,5,0)\ - SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4)\ - SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4)\ - SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4)\ - SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4)\ - SAVE_SOLUTION_m2n4(6,7,32) - -#define SOLVE_RN_m2n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "movq %2,%3; addq $8,%2;"\ - SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4) SUBTRACT_m2n4(0,8,9,%1,%%r12,8)\ - SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4) SUBTRACT_m2n4(16,8,9,%1,%%r12,8)\ - SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4) SUBTRACT_m2n4(32,8,9,%1,%%r12,8)\ - SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4) SUBTRACT_m2n4(48,8,9,%1,%%r12,8)\ - SAVE_SOLUTION_m2n4(4,5,0)\ - SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4) SUBTRACT_m2n4(64,8,9,%1,%%r12,8)\ - SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4) SUBTRACT_m2n4(80,8,9,%1,%%r12,8)\ - SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4) SUBTRACT_m2n4(96,8,9,%1,%%r12,8)\ - SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4) SUBTRACT_m2n4(112,8,9,%1,%%r12,8)\ - SAVE_SOLUTION_m2n4(6,7,32)\ - SOLVE_col1_ltor_m2n4(128,8,9,%1,%%r12,8)\ - SOLVE_col2_ltor_m2n4(144,8,9,%1,%%r12,8)\ - SOLVE_col3_ltor_m2n4(160,8,9,%1,%%r12,8)\ - SOLVE_col4_ltor_m2n4(176,8,9,%1,%%r12,8)\ - SAVE_SOLUTION_m2n4(8,9,64) - -#define SOLVE_RN_m1n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "movq %2,%3; addq $4,%2;"\ - SOLVE_col1_ltor_m1n4(0,4,%1)\ - SOLVE_col2_ltor_m1n4(16,4,%1)\ - SOLVE_col3_ltor_m1n4(32,4,%1)\ - SOLVE_col4_ltor_m1n4(48,4,%1)\ - SAVE_SOLUTION_m1n4(4,0) - -#define SOLVE_RN_m1n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "movq %2,%3; addq $4,%2;"\ - SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4)\ - SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4)\ - SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4)\ - SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4)\ - SAVE_SOLUTION_m1n4(4,0)\ 
- SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4)\ - SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4)\ - SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4)\ - SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4)\ - SAVE_SOLUTION_m1n4(5,16) - -#define SOLVE_RN_m1n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "movq %2,%3; addq $4,%2;"\ - SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4) SUBTRACT_m1n4(0,6,%1,%%r12,8)\ - SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4) SUBTRACT_m1n4(16,6,%1,%%r12,8)\ - SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4) SUBTRACT_m1n4(32,6,%1,%%r12,8)\ - SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4) SUBTRACT_m1n4(48,6,%1,%%r12,8)\ - SAVE_SOLUTION_m1n4(4,0)\ - SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4) SUBTRACT_m1n4(64,6,%1,%%r12,8)\ - SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4) SUBTRACT_m1n4(80,6,%1,%%r12,8)\ - SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4) SUBTRACT_m1n4(96,6,%1,%%r12,8)\ - SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4) SUBTRACT_m1n4(112,6,%1,%%r12,8)\ - SAVE_SOLUTION_m1n4(5,16)\ - SOLVE_col1_ltor_m1n4(128,6,%1,%%r12,8)\ - SOLVE_col2_ltor_m1n4(144,6,%1,%%r12,8)\ - SOLVE_col3_ltor_m1n4(160,6,%1,%%r12,8)\ - SOLVE_col4_ltor_m1n4(176,6,%1,%%r12,8)\ - SAVE_SOLUTION_m1n4(6,32) - -#define GEMM_RN_SIMPLE(mdim,ndim) \ - "movq %%r15,%0; leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ - "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ - "1"#mdim""#ndim"1:\n\t"\ - GEMM_KERNEL_k1m##mdim##n##ndim "addq $16,%1; addq $"#mdim"*4,%0; decq %5; jnz 1"#mdim""#ndim"1b;"\ - "1"#mdim""#ndim"2:\n\t" -#define GEMM_RN_m8n4 GEMM_RN_SIMPLE(8,4) -#define GEMM_RN_m8n8 GEMM_RN_SIMPLE(8,8) -#define GEMM_RN_m8n12 \ - "movq %%r15,%0; leaq (%%r15,%%r12,8),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ - "cmpq $8,%5; jb 18122f;"\ - "18121:\n\t"\ - GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ - GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ - "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ - "18122:\n\t"\ - "testq %5,%5; jz 18124f;"\ - "18123:\n\t"\ - GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1; decq %5; jnz 18123b;"\ - "18124:\n\t" -#define GEMM_RN_m4n4 GEMM_RN_SIMPLE(4,4) -#define GEMM_RN_m4n8 GEMM_RN_SIMPLE(4,8) -#define GEMM_RN_m4n12 GEMM_RN_SIMPLE(4,12) -#define GEMM_RN_m2n4 GEMM_RN_SIMPLE(2,4) -#define GEMM_RN_m2n8 GEMM_RN_SIMPLE(2,8) -#define GEMM_RN_m2n12 GEMM_RN_SIMPLE(2,12) -#define GEMM_RN_m1n4 GEMM_RN_SIMPLE(1,4) -#define GEMM_RN_m1n8 GEMM_RN_SIMPLE(1,8) -#define GEMM_RN_m1n12 GEMM_RN_SIMPLE(1,12) - -#define COMPUTE(ndim) {\ - __asm__ __volatile__(\ - "movq %0,%%r15; movq %1,%%r14; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %10,%%r11;"\ - "cmpq $8,%%r11; jb "#ndim"772f;"\ - #ndim"771:\n\t"\ - GEMM_RN_m8n##ndim SOLVE_RN_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ - #ndim"772:\n\t"\ - "testq $4,%%r11; jz "#ndim"773f;"\ - GEMM_RN_m4n##ndim SOLVE_RN_m4n##ndim "subq $4,%%r11;"\ - #ndim"773:\n\t"\ - "testq $2,%%r11; jz "#ndim"774f;"\ - GEMM_RN_m2n##ndim SOLVE_RN_m2n##ndim "subq $2,%%r11;"\ - #ndim"774:\n\t"\ - "testq $1,%%r11; jz "#ndim"775f;"\ - GEMM_RN_m1n##ndim SOLVE_RN_m1n##ndim "subq $1,%%r11;"\ - #ndim"775:\n\t"\ - "movq 
%%r15,%0; movq %%r14,%1; vzeroupper;"\ - :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ - :"r11","r12","r13","r14","r15","cc","memory",\ - "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ - a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ldc * ndim - M; OFF += ndim;\ -} - -static void solve_RN(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { - FLOAT a0, b0; - int i, j, k; - for (i=0; i7;m_count-=8){ - if(kk>0) GEMM_KERNEL_N(8,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); - solve_RN(8,n,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); - a_ptr += k * 8; c_ptr += 8; - } - for(;m_count>3;m_count-=4){ - if(kk>0) GEMM_KERNEL_N(4,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); - solve_RN(4,n,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); - a_ptr += k * 4; c_ptr += 4; - } - for(;m_count>1;m_count-=2){ - if(kk>0) GEMM_KERNEL_N(2,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); - solve_RN(2,n,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); - a_ptr += k * 2; c_ptr += 2; - } - if(m_count>0){ - if(kk>0) GEMM_KERNEL_N(1,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); - solve_RN(1,n,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); - a_ptr += k * 1; c_ptr += 1; - } -} -int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ - float *a_ptr = sa, *b_ptr = sb, *c_ptr = C, *c_tmp = C; - float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; - float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; - uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)-offset, k_cnt = 0; - BLASLONG n_count = n; - for(;n_count>11;n_count-=12) COMPUTE(12) - for(;n_count>7;n_count-=8) COMPUTE(8) - for(;n_count>3;n_count-=4) COMPUTE(4) - for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); b_ptr += 2*k; c_ptr += ldc*2; OFF+=2;} - if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); - return 0; -} +#include "common.h" +#include +#include "strsm_kernel_8x4_haswell_R_common.h" + +#define SOLVE_RN_m8n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "movq %2,%3; addq $32,%2;"\ + SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1)\ + SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1)\ + SAVE_SOLUTION_m8n2(4,5,0)\ + SOLVE_leri_m8n2(40,6,7,%1)\ + SOLVE_ri_m8n2(56,6,7,%1)\ + SAVE_SOLUTION_m8n2(6,7,64) + +#define SOLVE_RN_m8n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "movq %2,%3; addq $32,%2;"\ + SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4)\ + SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4)\ + SAVE_SOLUTION_m8n2(4,5,0)\ + SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4)\ + SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4)\ + SAVE_SOLUTION_m8n2(6,7,64)\ + SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4)\ + SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4)\ + SAVE_SOLUTION_m8n2(8,9,128)\ + SOLVE_leri_m8n2(104,10,11,%1,%%r12,4)\ + SOLVE_ri_m8n2(120,10,11,%1,%%r12,4)\ + SAVE_SOLUTION_m8n2(10,11,192) + +#define SOLVE_RN_m8n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "movq %2,%3; addq $32,%2;"\ + SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) 
SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4) SUBTRACT_m8n2(0,12,13,%1,%%r12,8) SUBTRACT_m8n2(8,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4) SUBTRACT_m8n2(16,12,13,%1,%%r12,8) SUBTRACT_m8n2(24,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(4,5,0)\ + SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4) SUBTRACT_m8n2(32,12,13,%1,%%r12,8) SUBTRACT_m8n2(40,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4) SUBTRACT_m8n2(48,12,13,%1,%%r12,8) SUBTRACT_m8n2(56,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(6,7,64)\ + SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4) SUBTRACT_m8n2(64,12,13,%1,%%r12,8) SUBTRACT_m8n2(72,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4) SUBTRACT_m8n2(80,12,13,%1,%%r12,8) SUBTRACT_m8n2(88,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(8,9,128)\ + SOLVE_leri_m8n2(104,10,11,%1,%%r12,4) SUBTRACT_m8n2(96,12,13,%1,%%r12,8) SUBTRACT_m8n2(104,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(120,10,11,%1,%%r12,4) SUBTRACT_m8n2(112,12,13,%1,%%r12,8) SUBTRACT_m8n2(120,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(10,11,192)\ + SOLVE_leri_m8n2(128,12,13,%1,%%r12,8) SUBTRACT_m8n2(136,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(144,12,13,%1,%%r12,8) SUBTRACT_m8n2(152,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(12,13,256)\ + SOLVE_leri_m8n2(168,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(184,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(14,15,320) + +#define SOLVE_RN_m4n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "movq %2,%3; addq $16,%2;"\ + SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1)\ + SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1)\ + SAVE_SOLUTION_m4n2(4,0)\ + SOLVE_leri_m4n2(40,5,%1)\ + SOLVE_ri_m4n2(56,5,%1)\ + SAVE_SOLUTION_m4n2(5,32) + +#define SOLVE_RN_m4n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "movq %2,%3; addq $16,%2;"\ + SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4)\ + SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4)\ + SAVE_SOLUTION_m4n2(4,0)\ + SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4)\ + SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4)\ + SAVE_SOLUTION_m4n2(5,32)\ + SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4)\ + SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4)\ + SAVE_SOLUTION_m4n2(6,64)\ + SOLVE_leri_m4n2(104,7,%1,%%r12,4)\ + SOLVE_ri_m4n2(120,7,%1,%%r12,4)\ + SAVE_SOLUTION_m4n2(7,96) + +#define SOLVE_RN_m4n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "movq %2,%3; addq $16,%2;"\ + SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4) SUBTRACT_m4n2(0,8,%1,%%r12,8) SUBTRACT_m4n2(8,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4) SUBTRACT_m4n2(16,8,%1,%%r12,8) SUBTRACT_m4n2(24,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(4,0)\ + SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4) SUBTRACT_m4n2(32,8,%1,%%r12,8) SUBTRACT_m4n2(40,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4) 
SUBTRACT_m4n2(48,8,%1,%%r12,8) SUBTRACT_m4n2(56,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(5,32)\ + SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4) SUBTRACT_m4n2(64,8,%1,%%r12,8) SUBTRACT_m4n2(72,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4) SUBTRACT_m4n2(80,8,%1,%%r12,8) SUBTRACT_m4n2(88,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(6,64)\ + SOLVE_leri_m4n2(104,7,%1,%%r12,4) SUBTRACT_m4n2(96,8,%1,%%r12,8) SUBTRACT_m4n2(104,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(120,7,%1,%%r12,4) SUBTRACT_m4n2(112,8,%1,%%r12,8) SUBTRACT_m4n2(120,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(7,96)\ + SOLVE_leri_m4n2(128,8,%1,%%r12,8) SUBTRACT_m4n2(136,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(144,8,%1,%%r12,8) SUBTRACT_m4n2(152,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(8,128)\ + SOLVE_leri_m4n2(168,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(184,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(9,160) + +#define SOLVE_RN_m2n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "movq %2,%3; addq $8,%2;"\ + SOLVE_col1_ltor_m2n4(0,4,5,%1)\ + SOLVE_col2_ltor_m2n4(16,4,5,%1)\ + SOLVE_col3_ltor_m2n4(32,4,5,%1)\ + SOLVE_col4_ltor_m2n4(48,4,5,%1)\ + SAVE_SOLUTION_m2n4(4,5,0) + +#define SOLVE_RN_m2n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "movq %2,%3; addq $8,%2;"\ + SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4)\ + SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4)\ + SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4)\ + SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4)\ + SAVE_SOLUTION_m2n4(4,5,0)\ + SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4)\ + SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4)\ + SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4)\ + SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4)\ + SAVE_SOLUTION_m2n4(6,7,32) + +#define SOLVE_RN_m2n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "movq %2,%3; addq $8,%2;"\ + SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4) SUBTRACT_m2n4(0,8,9,%1,%%r12,8)\ + SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4) SUBTRACT_m2n4(16,8,9,%1,%%r12,8)\ + SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4) SUBTRACT_m2n4(32,8,9,%1,%%r12,8)\ + SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4) SUBTRACT_m2n4(48,8,9,%1,%%r12,8)\ + SAVE_SOLUTION_m2n4(4,5,0)\ + SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4) SUBTRACT_m2n4(64,8,9,%1,%%r12,8)\ + SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4) SUBTRACT_m2n4(80,8,9,%1,%%r12,8)\ + SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4) SUBTRACT_m2n4(96,8,9,%1,%%r12,8)\ + SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4) SUBTRACT_m2n4(112,8,9,%1,%%r12,8)\ + SAVE_SOLUTION_m2n4(6,7,32)\ + SOLVE_col1_ltor_m2n4(128,8,9,%1,%%r12,8)\ + SOLVE_col2_ltor_m2n4(144,8,9,%1,%%r12,8)\ + SOLVE_col3_ltor_m2n4(160,8,9,%1,%%r12,8)\ + SOLVE_col4_ltor_m2n4(176,8,9,%1,%%r12,8)\ + SAVE_SOLUTION_m2n4(8,9,64) + +#define SOLVE_RN_m1n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "movq %2,%3; addq $4,%2;"\ + SOLVE_col1_ltor_m1n4(0,4,%1)\ + SOLVE_col2_ltor_m1n4(16,4,%1)\ + SOLVE_col3_ltor_m1n4(32,4,%1)\ + SOLVE_col4_ltor_m1n4(48,4,%1)\ + SAVE_SOLUTION_m1n4(4,0) + +#define SOLVE_RN_m1n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "movq %2,%3; addq $4,%2;"\ + SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4)\ + SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4)\ + SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4)\ + SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4)\ + SAVE_SOLUTION_m1n4(4,0)\ + 
SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4)\ + SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4)\ + SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4)\ + SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4)\ + SAVE_SOLUTION_m1n4(5,16) + +#define SOLVE_RN_m1n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "movq %2,%3; addq $4,%2;"\ + SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4) SUBTRACT_m1n4(0,6,%1,%%r12,8)\ + SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4) SUBTRACT_m1n4(16,6,%1,%%r12,8)\ + SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4) SUBTRACT_m1n4(32,6,%1,%%r12,8)\ + SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4) SUBTRACT_m1n4(48,6,%1,%%r12,8)\ + SAVE_SOLUTION_m1n4(4,0)\ + SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4) SUBTRACT_m1n4(64,6,%1,%%r12,8)\ + SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4) SUBTRACT_m1n4(80,6,%1,%%r12,8)\ + SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4) SUBTRACT_m1n4(96,6,%1,%%r12,8)\ + SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4) SUBTRACT_m1n4(112,6,%1,%%r12,8)\ + SAVE_SOLUTION_m1n4(5,16)\ + SOLVE_col1_ltor_m1n4(128,6,%1,%%r12,8)\ + SOLVE_col2_ltor_m1n4(144,6,%1,%%r12,8)\ + SOLVE_col3_ltor_m1n4(160,6,%1,%%r12,8)\ + SOLVE_col4_ltor_m1n4(176,6,%1,%%r12,8)\ + SAVE_SOLUTION_m1n4(6,32) + +#define GEMM_RN_SIMPLE(mdim,ndim) \ + "movq %%r15,%0; leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ + "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ + "1"#mdim""#ndim"1:\n\t"\ + GEMM_KERNEL_k1m##mdim##n##ndim "addq $16,%1; addq $"#mdim"*4,%0; decq %5; jnz 1"#mdim""#ndim"1b;"\ + "1"#mdim""#ndim"2:\n\t" +#define GEMM_RN_m8n4 GEMM_RN_SIMPLE(8,4) +#define GEMM_RN_m8n8 GEMM_RN_SIMPLE(8,8) +#define GEMM_RN_m8n12 \ + "movq %%r15,%0; leaq (%%r15,%%r12,8),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ + "cmpq $8,%5; jb 18122f;"\ + "18121:\n\t"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ + "18122:\n\t"\ + "testq %5,%5; jz 18124f;"\ + "18123:\n\t"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1; decq %5; jnz 18123b;"\ + "18124:\n\t" +#define GEMM_RN_m4n4 GEMM_RN_SIMPLE(4,4) +#define GEMM_RN_m4n8 GEMM_RN_SIMPLE(4,8) +#define GEMM_RN_m4n12 GEMM_RN_SIMPLE(4,12) +#define GEMM_RN_m2n4 GEMM_RN_SIMPLE(2,4) +#define GEMM_RN_m2n8 GEMM_RN_SIMPLE(2,8) +#define GEMM_RN_m2n12 GEMM_RN_SIMPLE(2,12) +#define GEMM_RN_m1n4 GEMM_RN_SIMPLE(1,4) +#define GEMM_RN_m1n8 GEMM_RN_SIMPLE(1,8) +#define GEMM_RN_m1n12 GEMM_RN_SIMPLE(1,12) + +#define COMPUTE(ndim) {\ + __asm__ __volatile__(\ + "movq %0,%%r15; movq %1,%%r14; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %10,%%r11;"\ + "cmpq $8,%%r11; jb "#ndim"772f;"\ + #ndim"771:\n\t"\ + GEMM_RN_m8n##ndim SOLVE_RN_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ + #ndim"772:\n\t"\ + "testq $4,%%r11; jz "#ndim"773f;"\ + GEMM_RN_m4n##ndim SOLVE_RN_m4n##ndim "subq $4,%%r11;"\ + #ndim"773:\n\t"\ + "testq $2,%%r11; jz "#ndim"774f;"\ + GEMM_RN_m2n##ndim SOLVE_RN_m2n##ndim "subq $2,%%r11;"\ + #ndim"774:\n\t"\ + "testq $1,%%r11; jz "#ndim"775f;"\ + GEMM_RN_m1n##ndim SOLVE_RN_m1n##ndim "subq $1,%%r11;"\ + #ndim"775:\n\t"\ + "movq 
%%r15,%0; movq %%r14,%1; vzeroupper;"\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ + :"r11","r12","r13","r14","r15","cc","memory",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ + a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ldc * ndim - M; OFF += ndim;\ +} + +static void solve_RN(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT a0, b0; + int i, j, k; + for (i=0; i7;m_count-=8){ + if(kk>0) GEMM_KERNEL_N(8,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_RN(8,n,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); + a_ptr += k * 8; c_ptr += 8; + } + for(;m_count>3;m_count-=4){ + if(kk>0) GEMM_KERNEL_N(4,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_RN(4,n,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); + a_ptr += k * 4; c_ptr += 4; + } + for(;m_count>1;m_count-=2){ + if(kk>0) GEMM_KERNEL_N(2,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_RN(2,n,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); + a_ptr += k * 2; c_ptr += 2; + } + if(m_count>0){ + if(kk>0) GEMM_KERNEL_N(1,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_RN(1,n,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); + a_ptr += k * 1; c_ptr += 1; + } +} +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ + float *a_ptr = sa, *b_ptr = sb, *c_ptr = C, *c_tmp = C; + float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; + float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; + uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)-offset, k_cnt = 0; + BLASLONG n_count = n; + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); b_ptr += 2*k; c_ptr += ldc*2; OFF+=2;} + if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); + return 0; +} diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_RT.c b/kernel/x86_64/strsm_kernel_8x4_haswell_RT.c index ffcbfbbf0..9de3354de 100644 --- a/kernel/x86_64/strsm_kernel_8x4_haswell_RT.c +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_RT.c @@ -1,281 +1,281 @@ -#include "common.h" -#include -#include "strsm_kernel_8x4_haswell_R_common.h" - -#define SOLVE_RT_m8n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ - SOLVE_rile_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ - SOLVE_le_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ - SAVE_SOLUTION_m8n2(6,7,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-48,4,5,%1)\ - SOLVE_le_m8n2(-64,4,5,%1)\ - SAVE_SOLUTION_m8n2(4,5,-128) - -#define SOLVE_RT_m8n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ - SOLVE_rile_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ - SOLVE_le_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ - SAVE_SOLUTION_m8n2(10,11,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ - SOLVE_le_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ - SAVE_SOLUTION_m8n2(8,9,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ - SOLVE_le_m8n2(-88,6,7,%1) 
SUBTRACT_m8n2(-96,4,5,%1)\ - SAVE_SOLUTION_m8n2(6,7,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-112,4,5,%1)\ - SOLVE_le_m8n2(-128,4,5,%1)\ - SAVE_SOLUTION_m8n2(4,5,-256) - -#define SOLVE_RT_m8n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ - SOLVE_rile_m8n2(-8,14,15,%1,%%r12,8) SUBTRACT_m8n2(-16,12,13,%1,%%r12,8) SUBTRACT_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ - SOLVE_le_m8n2(-24,14,15,%1,%%r12,8) SUBTRACT_m8n2(-32,12,13,%1,%%r12,8) SUBTRACT_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ - SAVE_SOLUTION_m8n2(14,15,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-48,12,13,%1,%%r12,8) SUBTRACT_m8n2(-40,10,11,%1,%%r12,4) SUBTRACT_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ - SOLVE_le_m8n2(-64,12,13,%1,%%r12,8) SUBTRACT_m8n2(-56,10,11,%1,%%r12,4) SUBTRACT_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ - SAVE_SOLUTION_m8n2(12,13,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-72,10,11,%1,%%r12,4) SUBTRACT_m8n2(-80,8,9,%1,%%r12,4) SUBTRACT_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ - SOLVE_le_m8n2(-88,10,11,%1,%%r12,4) SUBTRACT_m8n2(-96,8,9,%1,%%r12,4) SUBTRACT_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ - SAVE_SOLUTION_m8n2(10,11,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-112,8,9,%1,%%r12,4) SUBTRACT_m8n2(-104,6,7,%1) SUBTRACT_m8n2(-112,4,5,%1)\ - SOLVE_le_m8n2(-128,8,9,%1,%%r12,4) SUBTRACT_m8n2(-120,6,7,%1) SUBTRACT_m8n2(-128,4,5,%1)\ - SAVE_SOLUTION_m8n2(8,9,-256) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-136,6,7,%1) SUBTRACT_m8n2(-144,4,5,%1)\ - SOLVE_le_m8n2(-152,6,7,%1) SUBTRACT_m8n2(-160,4,5,%1)\ - SAVE_SOLUTION_m8n2(6,7,-320) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m8n2(-176,4,5,%1)\ - SOLVE_le_m8n2(-192,4,5,%1)\ - SAVE_SOLUTION_m8n2(4,5,-384) - -#define SOLVE_RT_m4n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ - SOLVE_rile_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ - SOLVE_le_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ - SAVE_SOLUTION_m4n2(5,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-48,4,%1)\ - SOLVE_le_m4n2(-64,4,%1)\ - SAVE_SOLUTION_m4n2(4,-64) - -#define SOLVE_RT_m4n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ - SOLVE_rile_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ - SOLVE_le_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ - SAVE_SOLUTION_m4n2(7,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ - SOLVE_le_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ - SAVE_SOLUTION_m4n2(6,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ - SOLVE_le_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ - SAVE_SOLUTION_m4n2(5,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-112,4,%1)\ - SOLVE_le_m4n2(-128,4,%1)\ - SAVE_SOLUTION_m4n2(4,-128) - -#define SOLVE_RT_m4n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) 
GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ - SOLVE_rile_m4n2(-8,9,%1,%%r12,8) SUBTRACT_m4n2(-16,8,%1,%%r12,8) SUBTRACT_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ - SOLVE_le_m4n2(-24,9,%1,%%r12,8) SUBTRACT_m4n2(-32,8,%1,%%r12,8) SUBTRACT_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ - SAVE_SOLUTION_m4n2(9,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-48,8,%1,%%r12,8) SUBTRACT_m4n2(-40,7,%1,%%r12,4) SUBTRACT_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ - SOLVE_le_m4n2(-64,8,%1,%%r12,8) SUBTRACT_m4n2(-56,7,%1,%%r12,4) SUBTRACT_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ - SAVE_SOLUTION_m4n2(8,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-72,7,%1,%%r12,4) SUBTRACT_m4n2(-80,6,%1,%%r12,4) SUBTRACT_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ - SOLVE_le_m4n2(-88,7,%1,%%r12,4) SUBTRACT_m4n2(-96,6,%1,%%r12,4) SUBTRACT_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ - SAVE_SOLUTION_m4n2(7,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-112,6,%1,%%r12,4) SUBTRACT_m4n2(-104,5,%1) SUBTRACT_m4n2(-112,4,%1)\ - SOLVE_le_m4n2(-128,6,%1,%%r12,4) SUBTRACT_m4n2(-120,5,%1) SUBTRACT_m4n2(-128,4,%1)\ - SAVE_SOLUTION_m4n2(6,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-136,5,%1) SUBTRACT_m4n2(-144,4,%1)\ - SOLVE_le_m4n2(-152,5,%1) SUBTRACT_m4n2(-160,4,%1)\ - SAVE_SOLUTION_m4n2(5,-160) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ - SOLVE_rile_m4n2(-176,4,%1)\ - SOLVE_le_m4n2(-192,4,%1)\ - SAVE_SOLUTION_m4n2(4,-192) - -#define SOLVE_RT_m2n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ - SOLVE_col4_rtol_m2n4(-16,4,5,%1)\ - SOLVE_col3_rtol_m2n4(-32,4,5,%1)\ - SOLVE_col2_rtol_m2n4(-48,4,5,%1)\ - SOLVE_col1_rtol_m2n4(-64,4,5,%1)\ - SAVE_SOLUTION_m2n4(4,5,-32) - -#define SOLVE_RT_m2n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ - SOLVE_col4_rtol_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ - SOLVE_col3_rtol_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ - SOLVE_col2_rtol_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ - SOLVE_col1_rtol_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ - SAVE_SOLUTION_m2n4(6,7,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ - SOLVE_col4_rtol_m2n4(-80,4,5,%1)\ - SOLVE_col3_rtol_m2n4(-96,4,5,%1)\ - SOLVE_col2_rtol_m2n4(-112,4,5,%1)\ - SOLVE_col1_rtol_m2n4(-128,4,5,%1)\ - SAVE_SOLUTION_m2n4(4,5,-64) - -#define SOLVE_RT_m2n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ - SOLVE_col4_rtol_m2n4(-16,8,9,%1,%%r12,8) SUBTRACT_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ - SOLVE_col3_rtol_m2n4(-32,8,9,%1,%%r12,8) SUBTRACT_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ - SOLVE_col2_rtol_m2n4(-48,8,9,%1,%%r12,8) SUBTRACT_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ - SOLVE_col1_rtol_m2n4(-64,8,9,%1,%%r12,8) SUBTRACT_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ - SAVE_SOLUTION_m2n4(8,9,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ - SOLVE_col4_rtol_m2n4(-80,6,7,%1,%%r12,4) SUBTRACT_m2n4(-80,4,5,%1)\ - SOLVE_col3_rtol_m2n4(-96,6,7,%1,%%r12,4) SUBTRACT_m2n4(-96,4,5,%1)\ - SOLVE_col2_rtol_m2n4(-112,6,7,%1,%%r12,4) 
SUBTRACT_m2n4(-112,4,5,%1)\ - SOLVE_col1_rtol_m2n4(-128,6,7,%1,%%r12,4) SUBTRACT_m2n4(-128,4,5,%1)\ - SAVE_SOLUTION_m2n4(6,7,-64) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ - SOLVE_col4_rtol_m2n4(-144,4,5,%1)\ - SOLVE_col3_rtol_m2n4(-160,4,5,%1)\ - SOLVE_col2_rtol_m2n4(-176,4,5,%1)\ - SOLVE_col1_rtol_m2n4(-192,4,5,%1)\ - SAVE_SOLUTION_m2n4(4,5,-96) - -#define SOLVE_RT_m1n4 \ - "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ - SOLVE_col4_rtol_m1n4(-16,4,%1)\ - SOLVE_col3_rtol_m1n4(-32,4,%1)\ - SOLVE_col2_rtol_m1n4(-48,4,%1)\ - SOLVE_col1_rtol_m1n4(-64,4,%1)\ - SAVE_SOLUTION_m1n4(4,-16) - -#define SOLVE_RT_m1n8 \ - "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ - SOLVE_col4_rtol_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ - SOLVE_col3_rtol_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ - SOLVE_col2_rtol_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ - SOLVE_col1_rtol_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ - SAVE_SOLUTION_m1n4(5,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ - SOLVE_col4_rtol_m1n4(-80,4,%1)\ - SOLVE_col3_rtol_m1n4(-96,4,%1)\ - SOLVE_col2_rtol_m1n4(-112,4,%1)\ - SOLVE_col1_rtol_m1n4(-128,4,%1)\ - SAVE_SOLUTION_m1n4(4,-32) - -#define SOLVE_RT_m1n12 \ - "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ - SOLVE_col4_rtol_m1n4(-16,6,%1,%%r12,8) SUBTRACT_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ - SOLVE_col3_rtol_m1n4(-32,6,%1,%%r12,8) SUBTRACT_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ - SOLVE_col2_rtol_m1n4(-48,6,%1,%%r12,8) SUBTRACT_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ - SOLVE_col1_rtol_m1n4(-64,6,%1,%%r12,8) SUBTRACT_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ - SAVE_SOLUTION_m1n4(6,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ - SOLVE_col4_rtol_m1n4(-80,5,%1,%%r12,4) SUBTRACT_m1n4(-80,4,%1)\ - SOLVE_col3_rtol_m1n4(-96,5,%1,%%r12,4) SUBTRACT_m1n4(-96,4,%1)\ - SOLVE_col2_rtol_m1n4(-112,5,%1,%%r12,4) SUBTRACT_m1n4(-112,4,%1)\ - SOLVE_col1_rtol_m1n4(-128,5,%1,%%r12,4) SUBTRACT_m1n4(-128,4,%1)\ - SAVE_SOLUTION_m1n4(5,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ - SOLVE_col4_rtol_m1n4(-144,4,%1)\ - SOLVE_col3_rtol_m1n4(-160,4,%1)\ - SOLVE_col2_rtol_m1n4(-176,4,%1)\ - SOLVE_col1_rtol_m1n4(-192,4,%1)\ - SAVE_SOLUTION_m1n4(4,-48) - -/* r14 = b_tail, r15 = a_tail, r13 = k-kk */ -#define GEMM_RT_SIMPLE(mdim,ndim) \ - "leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ - "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ - "1"#mdim""#ndim"1:\n\t"\ - "subq $16,%1; subq $"#mdim"*4,%0;" GEMM_KERNEL_k1m##mdim##n##ndim "decq %5; jnz 1"#mdim""#ndim"1b;"\ - "1"#mdim""#ndim"2:\n\t" -#define GEMM_RT_m8n4 GEMM_RT_SIMPLE(8,4) -#define GEMM_RT_m8n8 GEMM_RT_SIMPLE(8,8) -#define GEMM_RT_m8n12 \ - "leaq (%%r15,%%r12,8),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ - "cmpq $8,%5; jb 18122f;"\ - "18121:\n\t"\ - "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ - "subq $8,%5; cmpq $8,%5; 
jnb 18121b;"\ - "18122:\n\t"\ - "testq %5,%5; jz 18124f;"\ - "18123:\n\t"\ - "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 "decq %5; jnz 18123b;"\ - "18124:\n\t" -#define GEMM_RT_m4n4 GEMM_RT_SIMPLE(4,4) -#define GEMM_RT_m4n8 GEMM_RT_SIMPLE(4,8) -#define GEMM_RT_m4n12 GEMM_RT_SIMPLE(4,12) -#define GEMM_RT_m2n4 GEMM_RT_SIMPLE(2,4) -#define GEMM_RT_m2n8 GEMM_RT_SIMPLE(2,8) -#define GEMM_RT_m2n12 GEMM_RT_SIMPLE(2,12) -#define GEMM_RT_m1n4 GEMM_RT_SIMPLE(1,4) -#define GEMM_RT_m1n8 GEMM_RT_SIMPLE(1,8) -#define GEMM_RT_m1n12 GEMM_RT_SIMPLE(1,12) - -#define COMPUTE(ndim) {\ - b_ptr -= (ndim-4)*K; c_ptr -= ndim * ldc;\ - __asm__ __volatile__(\ - "movq %0,%%r15; movq %6,%%r13; subq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %1,%%r14; movq %10,%%r11;"\ - "cmpq $8,%%r11; jb "#ndim"772f;"\ - #ndim"771:\n\t"\ - GEMM_RT_m8n##ndim SOLVE_RT_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ - #ndim"772:\n\t"\ - "testq $4,%%r11; jz "#ndim"773f;"\ - GEMM_RT_m4n##ndim SOLVE_RT_m4n##ndim "subq $4,%%r11;"\ - #ndim"773:\n\t"\ - "testq $2,%%r11; jz "#ndim"774f;"\ - GEMM_RT_m2n##ndim SOLVE_RT_m2n##ndim "subq $2,%%r11;"\ - #ndim"774:\n\t"\ - "testq $1,%%r11; jz "#ndim"775f;"\ - GEMM_RT_m1n##ndim SOLVE_RT_m1n##ndim "subq $1,%%r11;"\ - #ndim"775:\n\t"\ - "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ - :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ - :"r11","r12","r13","r14","r15","cc","memory",\ - "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ - a_ptr -= M * K; b_ptr -= 4 * K; c_ptr -= M; OFF -= ndim;\ -} - -static void solve_RT(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc){ - FLOAT a0, b0; - int i, j, k; - for (i=n-1;i>=0;i--) { - b0 = b[i*n+i]; - for (j=0;j7;m_count-=8){ - if(k-kk>0) GEMM_KERNEL_N(8,n,k-kk,-1.0,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); - solve_RT(8,n,a_ptr+(kk-n)*8,sb+(kk-n)*n,c_ptr,ldc); - a_ptr += k * 8; c_ptr += 8; - } - for(;m_count>3;m_count-=4){ - if(k-kk>0) GEMM_KERNEL_N(4,n,k-kk,-1.0,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); - solve_RT(4,n,a_ptr+(kk-n)*4,sb+(kk-n)*n,c_ptr,ldc); - a_ptr += k * 4; c_ptr += 4; - } - for(;m_count>1;m_count-=2){ - if(k-kk>0) GEMM_KERNEL_N(2,n,k-kk,-1.0,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); - solve_RT(2,n,a_ptr+(kk-n)*2,sb+(kk-n)*n,c_ptr,ldc); - a_ptr += k * 2; c_ptr += 2; - } - if(m_count>0){ - if(k-kk>0) GEMM_KERNEL_N(1,n,k-kk,-1.0,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); - solve_RT(1,n,a_ptr+(kk-n)*1,sb+(kk-n)*n,c_ptr,ldc); - a_ptr += k * 1; c_ptr += 1; - } -} -int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ - float *a_ptr = sa, *b_ptr = sb+n*k, *c_ptr = C+n*ldc, *c_tmp = C; - float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; - float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; - uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)(n-offset), k_cnt = 0; - BLASLONG n_count = n; - if(n&1){b_ptr-=k; c_ptr-=ldc; COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF--; n_count--;} - if(n&2){b_ptr-=k*2; c_ptr-=ldc*2; COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF-=2; n_count-=2;} - for(;n_count>11;n_count-=12) COMPUTE(12) - for(;n_count>7;n_count-=8) COMPUTE(8) - for(;n_count>3;n_count-=4) COMPUTE(4) - return 0; -} +#include "common.h" +#include +#include "strsm_kernel_8x4_haswell_R_common.h" + +#define SOLVE_RT_m8n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "negq %4; 
leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ + SOLVE_rile_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ + SOLVE_le_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ + SAVE_SOLUTION_m8n2(6,7,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-48,4,5,%1)\ + SOLVE_le_m8n2(-64,4,5,%1)\ + SAVE_SOLUTION_m8n2(4,5,-128) + +#define SOLVE_RT_m8n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ + SOLVE_rile_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ + SOLVE_le_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ + SAVE_SOLUTION_m8n2(10,11,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ + SOLVE_le_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ + SAVE_SOLUTION_m8n2(8,9,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ + SOLVE_le_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ + SAVE_SOLUTION_m8n2(6,7,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-112,4,5,%1)\ + SOLVE_le_m8n2(-128,4,5,%1)\ + SAVE_SOLUTION_m8n2(4,5,-256) + +#define SOLVE_RT_m8n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ + SOLVE_rile_m8n2(-8,14,15,%1,%%r12,8) SUBTRACT_m8n2(-16,12,13,%1,%%r12,8) SUBTRACT_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ + SOLVE_le_m8n2(-24,14,15,%1,%%r12,8) SUBTRACT_m8n2(-32,12,13,%1,%%r12,8) SUBTRACT_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ + SAVE_SOLUTION_m8n2(14,15,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-48,12,13,%1,%%r12,8) SUBTRACT_m8n2(-40,10,11,%1,%%r12,4) SUBTRACT_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ + SOLVE_le_m8n2(-64,12,13,%1,%%r12,8) SUBTRACT_m8n2(-56,10,11,%1,%%r12,4) SUBTRACT_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ + SAVE_SOLUTION_m8n2(12,13,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-72,10,11,%1,%%r12,4) SUBTRACT_m8n2(-80,8,9,%1,%%r12,4) SUBTRACT_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ + SOLVE_le_m8n2(-88,10,11,%1,%%r12,4) SUBTRACT_m8n2(-96,8,9,%1,%%r12,4) SUBTRACT_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ + SAVE_SOLUTION_m8n2(10,11,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-112,8,9,%1,%%r12,4) SUBTRACT_m8n2(-104,6,7,%1) SUBTRACT_m8n2(-112,4,5,%1)\ + SOLVE_le_m8n2(-128,8,9,%1,%%r12,4) SUBTRACT_m8n2(-120,6,7,%1) SUBTRACT_m8n2(-128,4,5,%1)\ + SAVE_SOLUTION_m8n2(8,9,-256) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-136,6,7,%1) SUBTRACT_m8n2(-144,4,5,%1)\ + SOLVE_le_m8n2(-152,6,7,%1) SUBTRACT_m8n2(-160,4,5,%1)\ + SAVE_SOLUTION_m8n2(6,7,-320) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-176,4,5,%1)\ + SOLVE_le_m8n2(-192,4,5,%1)\ + SAVE_SOLUTION_m8n2(4,5,-384) + +#define SOLVE_RT_m4n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ + SOLVE_rile_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ + SOLVE_le_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ + SAVE_SOLUTION_m4n2(5,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ 
+ SOLVE_rile_m4n2(-48,4,%1)\ + SOLVE_le_m4n2(-64,4,%1)\ + SAVE_SOLUTION_m4n2(4,-64) + +#define SOLVE_RT_m4n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ + SOLVE_rile_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ + SOLVE_le_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ + SAVE_SOLUTION_m4n2(7,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ + SOLVE_le_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ + SAVE_SOLUTION_m4n2(6,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ + SOLVE_le_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ + SAVE_SOLUTION_m4n2(5,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-112,4,%1)\ + SOLVE_le_m4n2(-128,4,%1)\ + SAVE_SOLUTION_m4n2(4,-128) + +#define SOLVE_RT_m4n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ + SOLVE_rile_m4n2(-8,9,%1,%%r12,8) SUBTRACT_m4n2(-16,8,%1,%%r12,8) SUBTRACT_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ + SOLVE_le_m4n2(-24,9,%1,%%r12,8) SUBTRACT_m4n2(-32,8,%1,%%r12,8) SUBTRACT_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ + SAVE_SOLUTION_m4n2(9,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-48,8,%1,%%r12,8) SUBTRACT_m4n2(-40,7,%1,%%r12,4) SUBTRACT_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ + SOLVE_le_m4n2(-64,8,%1,%%r12,8) SUBTRACT_m4n2(-56,7,%1,%%r12,4) SUBTRACT_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ + SAVE_SOLUTION_m4n2(8,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-72,7,%1,%%r12,4) SUBTRACT_m4n2(-80,6,%1,%%r12,4) SUBTRACT_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ + SOLVE_le_m4n2(-88,7,%1,%%r12,4) SUBTRACT_m4n2(-96,6,%1,%%r12,4) SUBTRACT_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ + SAVE_SOLUTION_m4n2(7,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-112,6,%1,%%r12,4) SUBTRACT_m4n2(-104,5,%1) SUBTRACT_m4n2(-112,4,%1)\ + SOLVE_le_m4n2(-128,6,%1,%%r12,4) SUBTRACT_m4n2(-120,5,%1) SUBTRACT_m4n2(-128,4,%1)\ + SAVE_SOLUTION_m4n2(6,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-136,5,%1) SUBTRACT_m4n2(-144,4,%1)\ + SOLVE_le_m4n2(-152,5,%1) SUBTRACT_m4n2(-160,4,%1)\ + SAVE_SOLUTION_m4n2(5,-160) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-176,4,%1)\ + SOLVE_le_m4n2(-192,4,%1)\ + SAVE_SOLUTION_m4n2(4,-192) + +#define SOLVE_RT_m2n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ + SOLVE_col4_rtol_m2n4(-16,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-32,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-48,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-64,4,5,%1)\ + SAVE_SOLUTION_m2n4(4,5,-32) + +#define SOLVE_RT_m2n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ + SOLVE_col4_rtol_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-64,6,7,%1,%%r12,4) 
SUBTRACT_m2n4(-64,4,5,%1)\ + SAVE_SOLUTION_m2n4(6,7,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m2n4(-80,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-96,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-112,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-128,4,5,%1)\ + SAVE_SOLUTION_m2n4(4,5,-64) + +#define SOLVE_RT_m2n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ + SOLVE_col4_rtol_m2n4(-16,8,9,%1,%%r12,8) SUBTRACT_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-32,8,9,%1,%%r12,8) SUBTRACT_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-48,8,9,%1,%%r12,8) SUBTRACT_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-64,8,9,%1,%%r12,8) SUBTRACT_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ + SAVE_SOLUTION_m2n4(8,9,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m2n4(-80,6,7,%1,%%r12,4) SUBTRACT_m2n4(-80,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-96,6,7,%1,%%r12,4) SUBTRACT_m2n4(-96,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-112,6,7,%1,%%r12,4) SUBTRACT_m2n4(-112,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-128,6,7,%1,%%r12,4) SUBTRACT_m2n4(-128,4,5,%1)\ + SAVE_SOLUTION_m2n4(6,7,-64) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m2n4(-144,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-160,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-176,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-192,4,5,%1)\ + SAVE_SOLUTION_m2n4(4,5,-96) + +#define SOLVE_RT_m1n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ + SOLVE_col4_rtol_m1n4(-16,4,%1)\ + SOLVE_col3_rtol_m1n4(-32,4,%1)\ + SOLVE_col2_rtol_m1n4(-48,4,%1)\ + SOLVE_col1_rtol_m1n4(-64,4,%1)\ + SAVE_SOLUTION_m1n4(4,-16) + +#define SOLVE_RT_m1n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ + SOLVE_col4_rtol_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ + SOLVE_col3_rtol_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ + SOLVE_col2_rtol_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ + SOLVE_col1_rtol_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ + SAVE_SOLUTION_m1n4(5,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m1n4(-80,4,%1)\ + SOLVE_col3_rtol_m1n4(-96,4,%1)\ + SOLVE_col2_rtol_m1n4(-112,4,%1)\ + SOLVE_col1_rtol_m1n4(-128,4,%1)\ + SAVE_SOLUTION_m1n4(4,-32) + +#define SOLVE_RT_m1n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ + SOLVE_col4_rtol_m1n4(-16,6,%1,%%r12,8) SUBTRACT_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ + SOLVE_col3_rtol_m1n4(-32,6,%1,%%r12,8) SUBTRACT_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ + SOLVE_col2_rtol_m1n4(-48,6,%1,%%r12,8) SUBTRACT_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ + SOLVE_col1_rtol_m1n4(-64,6,%1,%%r12,8) SUBTRACT_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ + SAVE_SOLUTION_m1n4(6,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m1n4(-80,5,%1,%%r12,4) SUBTRACT_m1n4(-80,4,%1)\ + SOLVE_col3_rtol_m1n4(-96,5,%1,%%r12,4) SUBTRACT_m1n4(-96,4,%1)\ + SOLVE_col2_rtol_m1n4(-112,5,%1,%%r12,4) SUBTRACT_m1n4(-112,4,%1)\ + SOLVE_col1_rtol_m1n4(-128,5,%1,%%r12,4) SUBTRACT_m1n4(-128,4,%1)\ + SAVE_SOLUTION_m1n4(5,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m1n4(-144,4,%1)\ + SOLVE_col3_rtol_m1n4(-160,4,%1)\ + SOLVE_col2_rtol_m1n4(-176,4,%1)\ + SOLVE_col1_rtol_m1n4(-192,4,%1)\ + SAVE_SOLUTION_m1n4(4,-48) + +/* r14 = b_tail, r15 = a_tail, 
r13 = k-kk */ +#define GEMM_RT_SIMPLE(mdim,ndim) \ + "leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ + "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ + "1"#mdim""#ndim"1:\n\t"\ + "subq $16,%1; subq $"#mdim"*4,%0;" GEMM_KERNEL_k1m##mdim##n##ndim "decq %5; jnz 1"#mdim""#ndim"1b;"\ + "1"#mdim""#ndim"2:\n\t" +#define GEMM_RT_m8n4 GEMM_RT_SIMPLE(8,4) +#define GEMM_RT_m8n8 GEMM_RT_SIMPLE(8,8) +#define GEMM_RT_m8n12 \ + "leaq (%%r15,%%r12,8),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ + "cmpq $8,%5; jb 18122f;"\ + "18121:\n\t"\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ + "18122:\n\t"\ + "testq %5,%5; jz 18124f;"\ + "18123:\n\t"\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 "decq %5; jnz 18123b;"\ + "18124:\n\t" +#define GEMM_RT_m4n4 GEMM_RT_SIMPLE(4,4) +#define GEMM_RT_m4n8 GEMM_RT_SIMPLE(4,8) +#define GEMM_RT_m4n12 GEMM_RT_SIMPLE(4,12) +#define GEMM_RT_m2n4 GEMM_RT_SIMPLE(2,4) +#define GEMM_RT_m2n8 GEMM_RT_SIMPLE(2,8) +#define GEMM_RT_m2n12 GEMM_RT_SIMPLE(2,12) +#define GEMM_RT_m1n4 GEMM_RT_SIMPLE(1,4) +#define GEMM_RT_m1n8 GEMM_RT_SIMPLE(1,8) +#define GEMM_RT_m1n12 GEMM_RT_SIMPLE(1,12) + +#define COMPUTE(ndim) {\ + b_ptr -= (ndim-4)*K; c_ptr -= ndim * ldc;\ + __asm__ __volatile__(\ + "movq %0,%%r15; movq %6,%%r13; subq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %1,%%r14; movq %10,%%r11;"\ + "cmpq $8,%%r11; jb "#ndim"772f;"\ + #ndim"771:\n\t"\ + GEMM_RT_m8n##ndim SOLVE_RT_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ + #ndim"772:\n\t"\ + "testq $4,%%r11; jz "#ndim"773f;"\ + GEMM_RT_m4n##ndim SOLVE_RT_m4n##ndim "subq $4,%%r11;"\ + #ndim"773:\n\t"\ + "testq $2,%%r11; jz "#ndim"774f;"\ + GEMM_RT_m2n##ndim SOLVE_RT_m2n##ndim "subq $2,%%r11;"\ + #ndim"774:\n\t"\ + "testq $1,%%r11; jz "#ndim"775f;"\ + GEMM_RT_m1n##ndim SOLVE_RT_m1n##ndim "subq $1,%%r11;"\ + #ndim"775:\n\t"\ + "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ + :"r11","r12","r13","r14","r15","cc","memory",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ + a_ptr -= M * K; b_ptr -= 4 * K; c_ptr -= M; OFF -= ndim;\ +} + +static void solve_RT(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc){ + FLOAT a0, b0; + int i, j, k; + for (i=n-1;i>=0;i--) { + b0 = b[i*n+i]; + for (j=0;j7;m_count-=8){ + if(k-kk>0) GEMM_KERNEL_N(8,n,k-kk,-1.0,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); + solve_RT(8,n,a_ptr+(kk-n)*8,sb+(kk-n)*n,c_ptr,ldc); + a_ptr += k * 8; c_ptr += 8; + } + for(;m_count>3;m_count-=4){ + if(k-kk>0) GEMM_KERNEL_N(4,n,k-kk,-1.0,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); + solve_RT(4,n,a_ptr+(kk-n)*4,sb+(kk-n)*n,c_ptr,ldc); + a_ptr += k * 4; c_ptr += 4; + } + for(;m_count>1;m_count-=2){ + if(k-kk>0) GEMM_KERNEL_N(2,n,k-kk,-1.0,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); + solve_RT(2,n,a_ptr+(kk-n)*2,sb+(kk-n)*n,c_ptr,ldc); + a_ptr += k * 2; c_ptr += 2; + } + if(m_count>0){ + if(k-kk>0) 
GEMM_KERNEL_N(1,n,k-kk,-1.0,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); + solve_RT(1,n,a_ptr+(kk-n)*1,sb+(kk-n)*n,c_ptr,ldc); + a_ptr += k * 1; c_ptr += 1; + } +} +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ + float *a_ptr = sa, *b_ptr = sb+n*k, *c_ptr = C+n*ldc, *c_tmp = C; + float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; + float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; + uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)(n-offset), k_cnt = 0; + BLASLONG n_count = n; + if(n&1){b_ptr-=k; c_ptr-=ldc; COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF--; n_count--;} + if(n&2){b_ptr-=k*2; c_ptr-=ldc*2; COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF-=2; n_count-=2;} + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>3;n_count-=4) COMPUTE(4) + return 0; +} From f92dd6e303da101107246a0274b172659f8db3ad Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Nov 2022 10:18:36 +0100 Subject: [PATCH 102/154] change line endings from CRLF to LF --- benchmark/Makefile | 6878 ++++++++++++++++++++++---------------------- benchmark/amax.c | 266 +- benchmark/amin.c | 274 +- benchmark/hbmv.c | 268 +- benchmark/hpmv.c | 266 +- benchmark/iamin.c | 240 +- benchmark/imax.c | 228 +- benchmark/imin.c | 228 +- benchmark/max.c | 226 +- benchmark/min.c | 226 +- benchmark/rotm.c | 276 +- benchmark/spmv.c | 292 +- 12 files changed, 4834 insertions(+), 4834 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index f2f3b354a..d9ddb9042 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -1,3439 +1,3439 @@ -TOPDIR = .. -include $(TOPDIR)/Makefile.system - -# ACML standard -#ACML=/opt/acml5.3.1/gfortran64_mp/lib -#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm - -# ACML custom -#ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib -#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm - -# ACML 6.1 custom -ACML=/home/saar/acml6.1/gfortran64_mp/lib -LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm - - -# Atlas Ubuntu -#ATLAS=/usr/lib/atlas-base -#LIBATLAS = -fopenmp $(ATLAS)/liblapack_atlas.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm - -# Atlas RHEL and Fedora -ATLAS=/usr/lib64/atlas -LIBATLAS = -fopenmp $(ATLAS)/liblapack.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm - -# Intel standard -# MKL=/opt/intel/mkl/lib/intel64 -# LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm - -# Intel custom -MKL=/home/saar/intel_mkl -LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm - -# Apple vecLib -LIBVECLIB = -framework Accelerate - -ESSL=/opt/ibm/lib -#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a -LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a - -ifneq ($(NO_LAPACK), 1) -GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ - scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ - sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ - sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ - csymv.goto zsymv.goto \ - sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ - spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto -else -GOTO_LAPACK_TARGETS= -endif - -ifeq 
($(BUILD_BFLOAT16),1) -GOTO_HALF_TARGETS=sbgemm.goto -else -GOTO_HALF_TARGETS= -endif - -ifeq ($(OSNAME), WINNT) - -goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ - scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ - sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ - strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ - strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ - sspr.goto dspr.goto \ - sspr2.goto dspr2.goto \ - ssyr.goto dsyr.goto \ - ssyr2.goto dsyr2.goto \ - ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ - ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ - sger.goto dger.goto cger.goto zger.goto \ - sdot.goto ddot.goto \ - srot.goto drot.goto csrot.goto zdrot.goto \ - srotm.goto drotm.goto \ - saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ - scopy.goto dcopy.goto ccopy.goto zcopy.goto \ - sswap.goto dswap.goto cswap.goto zswap.goto \ - sscal.goto dscal.goto cscal.goto zscal.goto \ - sasum.goto dasum.goto casum.goto zasum.goto \ - ssymv.goto dsymv.goto csymv.goto zsymv.goto \ - chemv.goto zhemv.goto \ - chbmv.goto zhbmv.goto \ - chpmv.goto zhpmv.goto \ - chemm.goto zhemm.goto \ - cherk.goto zherk.goto \ - cher2k.goto zher2k.goto \ - cher.goto zher.goto \ - cher2.goto zher2.goto \ - sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ - sspmv.goto dspmv.goto \ - strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ - stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \ - stpsv.goto dtpsv.goto ctpsv.goto ztpsv.goto \ - strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ - sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ - sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ - sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ - spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ - ssymm.goto dsymm.goto csymm.goto zsymm.goto \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS) - -acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ - scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ - sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ - strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ - strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ - sspr.acml dspr.acml \ - sspr2.acml dspr2.acml \ - ssyr.acml dsyr.acml \ - ssyr2.acml dsyr2.acml \ - ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ - ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ - sger.acml dger.acml cger.acml zger.acml \ - sdot.acml ddot.acml \ - srot.acml drot.acml csrot.acml zdrot.acml \ - srotm.acml drotm.acml \ - saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ - scopy.acml dcopy.acml ccopy.acml zcopy.acml \ - sswap.acml dswap.acml cswap.acml zswap.acml \ - sscal.acml dscal.acml cscal.acml zscal.acml \ - sasum.acml dasum.acml casum.acml zasum.acml \ - ssymv.acml dsymv.acml csymv.acml zsymv.acml \ - chemv.acml zhemv.acml \ - chbmv.acml zhbmv.acml \ - chpmv.acml zhpmv.acml \ - chemm.acml zhemm.acml \ - cherk.acml zherk.acml \ - cher2k.acml zher2k.acml \ - cher.acml zher.acml \ - cher2.acml zher2.acml \ - sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ - strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ - stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ - stpsv.acml dtpsv.acml ctpsv.acml ztpsv.acml \ - strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ - sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ - sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ - sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ - spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ - ssymm.acml dsymm.acml csymm.acml zsymm.acml \ - saxpby.acml daxpby.acml caxpby.acml zaxpby.acml - -atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ - scholesky.atlas 
dcholesky.atlas ccholesky.atlas zcholesky.atlas \ - sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ - strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ - strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ - sspr.atlas dspr.atlas \ - sspr2.atlas dspr2.atlas \ - ssyr.atlas dsyr.atlas \ - ssyr2.atlas dsyr2.atlas \ - ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ - ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ - sger.atlas dger.atlas cger.atlas zger.atlas\ - sdot.atlas ddot.atlas \ - srot.atlas drot.atlas csrot.atlas zdrot.atlas \ - srotm.atlas drotm.atlas \ - saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ - scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ - sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ - sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ - sasum.atlas dasum.atlas casum.atlas zasum.atlas \ - ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ - chemv.atlas zhemv.atlas \ - chbmv.atlas zhbmv.atlas \ - chpmv.atlas zhpmv.atlas \ - chemm.acml zhemm.acml \ - chemm.atlas zhemm.atlas \ - cherk.atlas zherk.atlas \ - cher2k.atlas zher2k.atlas \ - cher.atlas zher.atlas \ - cher2.atlas zher2.atlas \ - sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ - sspmv.atlas dspmv.atlas \ - strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ - stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \ - stpsv.atlas dtpsv.atlas ctpsv.atlas ztpsv.atlas \ - strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ - sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ - sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ - sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ - spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ - ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ - saxpby.atlas daxpby.atlas caxpby.atlas zaxpby.atlas - -mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ - scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ - sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ - strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ - strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ - sspr.mkl dspr.mkl \ - sspr2.mkl dspr2.mkl \ - ssyr.mkl dsyr.mkl \ - ssyr2.mkl dsyr2.mkl \ - ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ - ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ - sger.mkl dger.mkl cger.mkl zger.mkl \ - sdot.mkl ddot.mkl \ - srot.mkl drot.mkl csrot.mkl zdrot.mkl \ - srotm.mkl drotm.mkl \ - saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ - scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ - sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ - sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ - sasum.mkl dasum.mkl casum.mkl zasum.mkl \ - ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ - chemv.mkl zhemv.mkl \ - chbmv.mkl zhbmv.mkl \ - chpmv.mkl zhpmv.mkl \ - chemm.mkl zhemm.mkl \ - cherk.mkl zherk.mkl \ - cher2k.mkl zher2k.mkl \ - cher.mkl zher.mkl \ - cher2.mkl zher2.mkl \ - sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ - strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ - stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ - stpsv.mkl dtpsv.mkl ctpsv.mkl ztpsv.mkl \ - strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ - sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ - sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ - sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ - spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ - ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ - saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl - -else - -goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ - strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ - strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ - sspr.goto dspr.goto \ - sspr2.goto dspr2.goto \ - ssyr.goto dsyr.goto \ - ssyr2.goto dsyr2.goto \ - ssyrk.goto dsyrk.goto 
csyrk.goto zsyrk.goto \ - ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ - sger.goto dger.goto cger.goto zger.goto \ - sdot.goto ddot.goto cdot.goto zdot.goto \ - srot.goto drot.goto csrot.goto zdrot.goto \ - srotm.goto drotm.goto \ - saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ - scopy.goto dcopy.goto ccopy.goto zcopy.goto \ - sswap.goto dswap.goto cswap.goto zswap.goto \ - sscal.goto dscal.goto cscal.goto zscal.goto \ - sasum.goto dasum.goto casum.goto zasum.goto \ - ssymv.goto dsymv.goto \ - chemv.goto zhemv.goto \ - chbmv.goto zhbmv.goto \ - chpmv.goto zhpmv.goto \ - chemm.goto zhemm.goto \ - cherk.goto zherk.goto \ - cher2k.goto zher2k.goto \ - cher.goto zher.goto \ - cher2.goto zher2.goto \ - sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ - sspmv.goto dspmv.goto \ - strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ - stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \ - stpsv.goto dtpsv.goto ctpsv.goto ztpsv.goto \ - strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ - ssymm.goto dsymm.goto csymm.goto zsymm.goto \ - smallscaling \ - isamax.goto idamax.goto icamax.goto izamax.goto \ - ismax.goto idmax.goto \ - isamin.goto idamin.goto icamin.goto izamin.goto \ - ismin.goto idmin.goto \ - samax.goto damax.goto camax.goto zamax.goto \ - smax.goto dmax.goto \ - samin.goto damin.goto camin.goto zamin.goto \ - smin.goto dmin.goto \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ - snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) - -acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ - scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ - sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ - strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ - strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ - sspr.acml dspr.acml \ - sspr2.acml dspr2.acml \ - ssyr.acml dsyr.acml \ - ssyr2.acml dsyr2.acml \ - ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ - ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ - sger.acml dger.acml cger.acml zger.acml \ - sdot.acml ddot.acml \ - srot.acml drot.acml csrot.acml zdrot.acml \ - srotm.acml drotm.acml \ - saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ - scopy.acml dcopy.acml ccopy.acml zcopy.acml \ - sswap.acml dswap.acml cswap.acml zswap.acml \ - sscal.acml dscal.acml cscal.acml zscal.acml \ - sasum.acml dasum.acml casum.acml zasum.acml \ - ssymv.acml dsymv.acml csymv.acml zsymv.acml \ - chemv.acml zhemv.acml \ - chbmv.acml zhbmv.acml \ - chpmv.acml zhpmv.acml \ - chemm.acml zhemm.acml \ - cherk.acml zherk.acml \ - cher2k.acml zher2k.acml \ - cher.acml zher.acml \ - cher2.acml zher2.acml \ - sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ - strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ - stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ - stpsv.acml dtpsv.acml ctpsv.acml ztpsv.acml \ - strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ - sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ - sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ - sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ - spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ - ssymm.acml dsymm.acml csymm.acml zsymm.acml \ - saxpby.acml daxpby.acml caxpby.acml zaxpby.acml - -atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ - scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ - sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ - strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ - strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ - sspr.atlas dspr.atlas \ - sspr2.atlas dspr2.atlas \ - ssyr.atlas dsyr.atlas \ - ssyr2.atlas dsyr2.atlas \ - 
ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ - ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ - sger.atlas dger.atlas cger.atlas zger.atlas\ - sdot.atlas ddot.atlas \ - srot.atlas drot.atlas csrot.atlas zdrot.atlas \ - srotm.atlas drotm.atlas \ - saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ - scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ - sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ - sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ - sasum.atlas dasum.atlas casum.atlas zasum.atlas \ - ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ - chemv.atlas zhemv.atlas \ - chbmv.atlas zhbmv.atlas \ - chpmv.atlas zhpmv.atlas \ - chemm.acml zhemm.acml \ - chemm.atlas zhemm.atlas \ - cherk.atlas zherk.atlas \ - cher2k.atlas zher2k.atlas \ - cher.atlas zher.atlas \ - cher2.atlas zher2.atlas \ - sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ - sspmv.atlas dspmv.atlas \ - strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ - stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \ - stpsv.atlas dtpsv.atlas ctpsv.atlas ztpsv.atlas \ - strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ - sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ - sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ - sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ - spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ - ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ - isamax.atlas idamax.atlas icamax.atlas izamax.atlas \ - snrm2.atlas dnrm2.atlas scnrm2.atlas dznrm2.atlas \ - saxpby.atlas daxpby.atlas caxpby.atlas zaxpby.atlas - -mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ - scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ - sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ - strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ - strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ - sspr.mkl dspr.mkl \ - sspr2.mkl dspr2.mkl \ - ssyr.mkl dsyr.mkl \ - ssyr2.mkl dsyr2.mkl \ - ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ - ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ - sger.mkl dger.mkl cger.mkl zger.mkl \ - sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ - srot.atlas drot.atlas csrot.atlas zdrot.atlas \ - srotm.atlas drotm.atlas \ - saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ - scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ - sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ - sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ - sasum.mkl dasum.mkl casum.mkl zasum.mkl \ - ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ - chemv.mkl zhemv.mkl \ - chbmv.mkl zhbmv.mkl \ - chpmv.mkl zhpmv.mkl \ - chemm.mkl zhemm.mkl \ - cherk.mkl zherk.mkl \ - cher2k.mkl zher2k.mkl \ - cher.mkl zher.mkl \ - cher2.mkl zher2.mkl \ - sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ - strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ - stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ - stpsv.mkl dtpsv.mkl ctpsv.mkl ztpsv.mkl \ - strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ - sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ - sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ - sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ - spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ - ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ - saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl - - - - -endif - -essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ - cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ - slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl \ - scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl \ - strsm.essl dtrsm.essl ctrsm.essl ztrsm.essl - -veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ - scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ - sgemm.veclib 
dgemm.veclib cgemm.veclib zgemm.veclib \ - strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \ - strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \ - sspr.veclib dspr.veclib \ - sspr2.veclib dspr2.veclib \ - ssyr.veclib dsyr.veclib \ - ssyr2.veclib dsyr2.veclib \ - ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \ - ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ - sger.veclib dger.veclib cger.veclib zger.veclib \ - sdot.veclib ddot.veclib cdot.veclib zdot.veclib \ - srot.veclib drot.veclib csrot.veclib zdrot.veclib \ - srotm.veclib drotm.veclib \ - saxpy.veclib daxpy.veclib caxpy.veclib zaxpy.veclib \ - scopy.veclib dcopy.veclib ccopy.veclib zcopy.veclib \ - sswap.veclib dswap.veclib cswap.veclib zswap.veclib \ - sscal.veclib dscal.veclib cscal.veclib zscal.veclib \ - sasum.veclib dasum.veclib casum.veclib zasum.veclib \ - ssymv.veclib dsymv.veclib csymv.veclib zsymv.veclib \ - chemv.veclib zhemv.veclib \ - chbmv.veclib zhbmv.veclib \ - chpmv.veclib zhpmv.veclib \ - chemm.veclib zhemm.veclib \ - cherk.veclib zherk.veclib \ - cher2k.veclib zher2k.veclib \ - cher.veclib zher.veclib \ - cher2.veclib zher2.veclib \ - sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ - strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ - stpmv.veclib dtpmv.veclib ctpmv.veclib ztpmv.veclib \ - stpsv.veclib dtpsv.veclib ctpsv.veclib ztpsv.veclib \ - strsv.veclib dtrsv.veclib ctrsv.veclib ztrsv.veclib \ - sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ - sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ - sgetri.veclib dgetri.veclib cgetri.veclib zgetri.veclib \ - spotrf.veclib dpotrf.veclib cpotrf.veclib zpotrf.veclib \ - ssymm.veclib dsymm.veclib csymm.veclib zsymm.veclib \ - saxpby.veclib daxpby.veclib caxpby.veclib zaxpby.veclib - -goto_3m :: cgemm3m.goto zgemm3m.goto - -mkl_3m :: cgemm3m.mkl zgemm3m.mkl - -all :: goto mkl atlas acml veclib - -exe : - @./Make_exe.sh - -##################################### Slinpack #################################################### -slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -slinpack.acml : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -slinpack.atlas : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -slinpack.mkl : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -slinpack.veclib : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -slinpack.essl : slinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dlinpack #################################################### -dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dlinpack.acml : dlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dlinpack.atlas : dlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dlinpack.mkl : dlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dlinpack.veclib : dlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dlinpack.essl : dlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - 
-##################################### Clinpack #################################################### - -clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -clinpack.acml : clinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -clinpack.atlas : clinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -clinpack.mkl : clinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -clinpack.veclib : clinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -clinpack.essl : clinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zlinpack #################################################### - -zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zlinpack.acml : zlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zlinpack.atlas : zlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zlinpack.mkl : zlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zlinpack.veclib : zlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zlinpack.essl : zlinpack.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Scholesky ################################################### - -scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -scholesky.acml : scholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scholesky.atlas : scholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scholesky.mkl : scholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scholesky.veclib : scholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scholesky.essl : scholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dcholesky ################################################### - -dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dcholesky.acml : dcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcholesky.atlas : dcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcholesky.mkl : dcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcholesky.veclib : dcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcholesky.essl : dcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ccholesky ################################################### - -ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ccholesky.acml : 
ccholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccholesky.atlas : ccholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccholesky.mkl : ccholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccholesky.veclib : ccholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccholesky.essl : ccholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - -##################################### Zcholesky ################################################### - -zcholesky.goto : zcholesky.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zcholesky.acml : zcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcholesky.atlas : zcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcholesky.mkl : zcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcholesky.veclib : zcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcholesky.essl : zcholesky.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sgemm #################################################### -ifeq ($(BUILD_BFLOAT16),1) -sbgemm.goto : sbgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -endif - -sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sgemm.acml : sgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemm.atlas : sgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemm.mkl : sgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemm.veclib : sgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemm.essl : sgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dgemm #################################################### -dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dgemm.acml : dgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemm.atlas : dgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemm.mkl : dgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemm.veclib : dgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemm.essl : dgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgemm #################################################### - -cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgemm.acml : cgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemm.atlas : cgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) - -cgemm.mkl : cgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemm.veclib : cgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemm.essl : cgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgemm #################################################### - -zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgemm.acml : zgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemm.atlas : zgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemm.mkl : zgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemm.veclib : zgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemm.essl : zgemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ssymm #################################################### -ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssymm.acml : ssymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymm.atlas : ssymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymm.mkl : ssymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymm.veclib : ssymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dsymm #################################################### -dsymm.goto : dsymm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsymm.acml : dsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymm.atlas : dsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymm.mkl : dsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymm.veclib : dsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Csymm #################################################### - -csymm.goto : csymm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -csymm.acml : csymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymm.atlas : csymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymm.mkl : csymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymm.veclib : csymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zsymm #################################################### - -zsymm.goto : zsymm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zsymm.acml : zsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymm.atlas : zsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymm.mkl : zsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymm.veclib : zsymm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Strmm #################################################### -strmm.goto : strmm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -strmm.acml : strmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmm.atlas : strmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmm.mkl : strmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmm.veclib : strmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmm.essl : strmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtrmm #################################################### -dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtrmm.acml : dtrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmm.atlas : dtrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmm.mkl : dtrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmm.veclib : dtrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmm.essl : dtrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctrmm #################################################### - -ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctrmm.acml : ctrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmm.atlas : ctrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmm.mkl : ctrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmm.veclib : ctrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmm.essl : ctrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztrmm #################################################### - -ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztrmm.acml : ztrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmm.atlas : ztrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmm.mkl : ztrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmm.veclib : ztrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmm.essl : ztrmm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Strsm #################################################### -strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME) - $(CC) 
$(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -strsm.acml : strsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsm.atlas : strsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsm.mkl : strsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsm.veclib : strsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsm.essl : strsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtrsm #################################################### -dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtrsm.acml : dtrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsm.atlas : dtrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsm.mkl : dtrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsm.veclib : dtrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsm.essl : dtrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctrsm #################################################### - -ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctrsm.acml : ctrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsm.atlas : ctrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsm.mkl : ctrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsm.veclib : ctrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsm.essl : ctrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztrsm #################################################### - -ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztrsm.acml : ztrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsm.atlas : ztrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsm.mkl : ztrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsm.veclib : ztrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsm.essl : ztrsm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Ssyr #################################################### -ssyr.goto : ssyr.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssyr.acml : ssyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr.atlas : ssyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr.mkl : ssyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr.veclib : ssyr.$(SUFFIX) - -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Dsyr #################################################### -dsyr.goto : dsyr.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsyr.acml : dsyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr.atlas : dsyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr.mkl : dsyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr.veclib : dsyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sspr #################################################### -sspr.goto : sspr.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sspr.acml : sspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr.atlas : sspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr.mkl : sspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr.veclib : sspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dspr #################################################### -dspr.goto : dspr.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dspr.acml : dspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr.atlas : dspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr.mkl : dspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr.veclib : dspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sspr2 #################################################### -sspr2.goto : sspr2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sspr2.acml : sspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr2.atlas : sspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr2.mkl : sspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr2.veclib : sspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dspr2 #################################################### -dspr2.goto : dspr2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dspr2.acml : dspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr2.atlas : dspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr2.mkl : dspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr2.veclib : dspr2.$(SUFFIX) - -##################################### Ssyr2 #################################################### -ssyr2.goto : ssyr2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssyr2.acml : ssyr2.$(SUFFIX) - -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2.atlas : ssyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2.mkl : ssyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2.veclib : ssyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Dsyr2 #################################################### -dsyr2.goto : dsyr2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsyr2.acml : dsyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2.atlas : dsyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2.mkl : dsyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2.veclib : dsyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ssyrk #################################################### -ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssyrk.acml : ssyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyrk.atlas : ssyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyrk.mkl : ssyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyrk.veclib : ssyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dsyrk #################################################### -dsyrk.goto : dsyrk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsyrk.acml : dsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyrk.atlas : dsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyrk.mkl : dsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyrk.veclib : dsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Csyrk #################################################### - -csyrk.goto : csyrk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -csyrk.acml : csyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csyrk.atlas : csyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csyrk.mkl : csyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csyrk.veclib : csyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zsyrk #################################################### - -zsyrk.goto : zsyrk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zsyrk.acml : zsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsyrk.atlas : zsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsyrk.mkl : 
zsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsyrk.veclib : zsyrk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ssyr2k #################################################### -ssyr2k.goto : ssyr2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssyr2k.acml : ssyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2k.atlas : ssyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2k.mkl : ssyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2k.veclib : ssyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dsyr2k #################################################### -dsyr2k.goto : dsyr2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsyr2k.acml : dsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2k.atlas : dsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2k.mkl : dsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2k.veclib : dsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Csyr2k #################################################### - -csyr2k.goto : csyr2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -csyr2k.acml : csyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csyr2k.atlas : csyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csyr2k.mkl : csyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csyr2k.veclib : csyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zsyr2k #################################################### - -zsyr2k.goto : zsyr2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zsyr2k.acml : zsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsyr2k.atlas : zsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsyr2k.mkl : zsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsyr2k.veclib : zsyr2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Chemm #################################################### - -chemm.goto : chemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -chemm.acml : chemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemm.atlas : chemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemm.mkl : chemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemm.veclib : chemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zhemm #################################################### - -zhemm.goto : zhemm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zhemm.acml : zhemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemm.atlas : zhemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemm.mkl : zhemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemm.veclib : zhemm.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cherk #################################################### - -cherk.goto : cherk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cherk.acml : cherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cherk.atlas : cherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cherk.mkl : cherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cherk.veclib : cherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zherk #################################################### - -zherk.goto : zherk.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zherk.acml : zherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zherk.atlas : zherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zherk.mkl : zherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zherk.veclib : zherk.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cher2k #################################################### - -cher2k.goto : cher2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cher2k.acml : cher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher2k.atlas : cher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher2k.mkl : cher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher2k.veclib : cher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zher2k #################################################### - -zher2k.goto : zher2k.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zher2k.acml : zher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2k.atlas : zher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2k.mkl : zher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2k.veclib : zher2k.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cher #################################################### - -cher.goto : cher.$(SUFFIX) 
../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cher.acml : cher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher.atlas : cher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher.mkl : cher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher.veclib : cher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zher #################################################### - -zher.goto : zher.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zher.acml : zher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher.atlas : zher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher.mkl : zher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher.veclib : zher.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cher2 #################################################### - -cher2.goto : cher2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cher2.acml : cher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher2.atlas : cher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher2.mkl : cher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cher2.veclib : cher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zher2 #################################################### - -zher2.goto : zher2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zher2.acml : zher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2.atlas : zher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2.mkl : zher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zher2.veclib : zher2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sgemv #################################################### -sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sgemv.acml : sgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemv.atlas : sgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemv.mkl : sgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgemv.veclib : sgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dgemv #################################################### -dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dgemv.acml : dgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemv.atlas : 
dgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemv.mkl : dgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgemv.veclib : dgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgemv #################################################### - -cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgemv.acml : cgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemv.atlas : cgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemv.mkl : cgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemv.veclib : cgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgemv #################################################### - -zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgemv.acml : zgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemv.atlas : zgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemv.mkl : zgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemv.veclib : zgemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sspmv #################################################### -sspmv.goto : sspmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sspmv.atlas : sspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dspmv #################################################### -dspmv.goto : dspmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dspmv.atlas : dspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Strmv #################################################### -strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -strmv.acml : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmv.atlas : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmv.mkl : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmv.veclib : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtrmv #################################################### -dtrmv.goto : dtrmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtrmv.acml : dtrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmv.atlas : dtrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmv.mkl : dtrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - 
-dtrmv.veclib : dtrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctrmv #################################################### - -ctrmv.goto : ctrmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctrmv.acml : ctrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmv.atlas : ctrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmv.mkl : ctrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmv.veclib : ctrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztrmv #################################################### - -ztrmv.goto : ztrmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztrmv.acml : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmv.atlas : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmv.mkl : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmv.veclib : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - -##################################### Stpmv #################################################### -stpmv.goto : stpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -stpmv.acml : stpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -stpmv.atlas : stpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -stpmv.mkl : stpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -stpmv.veclib : stpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtpmv #################################################### -dtpmv.goto : dtpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtpmv.acml : dtpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtpmv.atlas : dtpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtpmv.mkl : dtpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtpmv.veclib : dtpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctpmv #################################################### - -ctpmv.goto : ctpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctpmv.acml : ctpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctpmv.atlas : ctpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctpmv.mkl : ctpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctpmv.veclib : ctpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztpmv 
#################################################### - -ztpmv.goto : ztpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztpmv.acml : ztpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztpmv.atlas : ztpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztpmv.mkl : ztpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztpmv.veclib : ztpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Stpsv #################################################### -stpsv.goto : stpsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -stpsv.acml : stpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -stpsv.atlas : stpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -stpsv.mkl : stpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -stpsv.veclib : stpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtpsv #################################################### -dtpsv.goto : dtpsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtpsv.acml : dtpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtpsv.atlas : dtpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtpsv.mkl : dtpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtpsv.veclib : dtpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctpsv #################################################### - -ctpsv.goto : ctpsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctpsv.acml : ctpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctpsv.atlas : ctpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctpsv.mkl : ctpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctpsv.veclib : ctpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztpsv #################################################### - -ztpsv.goto : ztpsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztpsv.acml : ztpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztpsv.atlas : ztpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztpsv.mkl : ztpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztpsv.veclib : ztpsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Strsv #################################################### -strsv.goto : strsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -strsv.acml : strsv.$(SUFFIX) - 
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsv.atlas : strsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsv.mkl : strsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsv.veclib : strsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtrsv #################################################### -dtrsv.goto : dtrsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtrsv.acml : dtrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsv.atlas : dtrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsv.mkl : dtrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsv.veclib : dtrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctrsv #################################################### - -ctrsv.goto : ctrsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctrsv.acml : ctrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsv.atlas : ctrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsv.mkl : ctrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsv.veclib : ctrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztrsv #################################################### - -ztrsv.goto : ztrsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztrsv.acml : ztrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsv.atlas : ztrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsv.mkl : ztrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsv.veclib : ztrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sger #################################################### -sger.goto : sger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sger.acml : sger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sger.atlas : sger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sger.mkl : sger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sger.veclib : sger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dger #################################################### -dger.goto : dger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dger.acml : dger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dger.atlas : dger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dger.mkl : dger.$(SUFFIX) - 
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dger.veclib : dger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cger #################################################### -cger.goto : cger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cger.acml : cger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cger.atlas : cger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cger.mkl : cger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cger.veclib : cger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zger #################################################### -zger.goto : zger.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zger.acml : zger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zger.atlas : zger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zger.mkl : zger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zger.veclib : zger.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ssymv #################################################### -ssymv.goto : ssymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssymv.acml : ssymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymv.atlas : ssymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymv.mkl : ssymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssymv.veclib : ssymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dsymv #################################################### -dsymv.goto : dsymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsymv.acml : dsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymv.atlas : dsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymv.mkl : dsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsymv.veclib : dsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Csymv #################################################### -csymv.goto : csymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -csymv.acml : csymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymv.atlas : csymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymv.mkl : csymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csymv.veclib : csymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### 
Dsymv #################################################### -zsymv.goto : zsymv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zsymv.acml : zsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymv.atlas : zsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymv.mkl : zsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zsymv.veclib : zsymv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sgeev #################################################### -sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sgeev.acml : sgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgeev.atlas : sgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgeev.mkl : sgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgeev.veclib : sgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dgeev #################################################### -dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dgeev.acml : dgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgeev.atlas : dgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgeev.mkl : dgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgeev.veclib : dgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgeev #################################################### - -cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgeev.acml : cgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgeev.atlas : cgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgeev.mkl : cgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgeev.veclib : cgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgeev #################################################### - -zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgeev.acml : zgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgeev.atlas : zgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgeev.mkl : zgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgeev.veclib : zgeev.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sgetri #################################################### -sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sgetri.acml : 
sgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgetri.atlas : sgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgetri.mkl : sgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgetri.veclib : sgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dgetri #################################################### -dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dgetri.acml : dgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgetri.atlas : dgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgetri.mkl : dgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgetri.veclib : dgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgetri #################################################### - -cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgetri.acml : cgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgetri.atlas : cgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgetri.mkl : cgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgetri.veclib : cgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgetri #################################################### - -zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME) - $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgetri.acml : zgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgetri.atlas : zgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgetri.mkl : zgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgetri.veclib : zgetri.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Spotrf #################################################### -spotrf.goto : spotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -spotrf.acml : spotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -spotrf.atlas : spotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -spotrf.mkl : spotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -spotrf.veclib : spotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dpotrf #################################################### -dpotrf.goto : dpotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dpotrf.acml : dpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dpotrf.atlas : dpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o 
$(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dpotrf.mkl : dpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dpotrf.veclib : dpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cpotrf #################################################### - -cpotrf.goto : cpotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cpotrf.acml : cpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cpotrf.atlas : cpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cpotrf.mkl : cpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cpotrf.veclib : cpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zpotrf #################################################### - -zpotrf.goto : zpotrf.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zpotrf.acml : zpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zpotrf.atlas : zpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zpotrf.mkl : zpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zpotrf.veclib : zpotrf.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Chemv #################################################### - -chemv.goto : chemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -chemv.acml : chemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemv.atlas : chemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemv.mkl : chemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chemv.veclib : chemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zhemv #################################################### - -zhemv.goto : zhemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zhemv.acml : zhemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemv.atlas : zhemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemv.mkl : zhemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhemv.veclib : zhemv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Chbmv #################################################### - -chbmv.goto : chbmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -chbmv.acml : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chbmv.atlas : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chbmv.mkl : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - 
-chbmv.veclib : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Zhbmv #################################################### - -zhbmv.goto : zhbmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zhbmv.acml : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhbmv.atlas : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhbmv.mkl : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhbmv.veclib : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Chpmv #################################################### - -chpmv.goto : chpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -chpmv.acml : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chpmv.atlas : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chpmv.mkl : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chpmv.veclib : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Zhpmv #################################################### - -zhpmv.goto : zhpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zhpmv.acml : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhpmv.atlas : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhpmv.mkl : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhpmv.veclib : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Sdot #################################################### -sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sdot.acml : sdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sdot.atlas : sdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sdot.mkl : sdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sdot.veclib : sdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ddot #################################################### -ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ddot.acml : ddot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ddot.atlas : ddot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ddot.mkl : ddot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ddot.veclib : ddot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cdot #################################################### -cdot.goto : cdot.$(SUFFIX) 
../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cdot.acml : cdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cdot.atlas : cdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cdot.mkl : cdot-intel.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cdot.veclib : cdot-intel.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zdot #################################################### -zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zdot.acml : zdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdot.atlas : zdot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdot.mkl : zdot-intel.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdot.veclib : zdot-intel.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Srot #################################################### -srot.goto : srot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -srot.acml : srot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srot.atlas : srot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srot.mkl : srot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srot.veclib : srot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Drot #################################################### -drot.goto : drot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -drot.acml : drot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drot.atlas : drot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drot.mkl : drot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drot.veclib : drot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### csrot #################################################### -csrot.goto : csrot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -csrot.acml : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csrot.atlas : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csrot.mkl : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csrot.veclib : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### zdrot #################################################### -zdrot.goto : zdrot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zdrot.acml : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdrot.atlas : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o 
$(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdrot.mkl : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdrot.veclib : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### srotm #################################################### -srotm.goto : srotm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -srotm.acml : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srotm.atlas : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srotm.mkl : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srotm.veclib : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### drotm #################################################### -drotm.goto : drotm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -drotm.acml : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drotm.atlas : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drotm.mkl : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drotm.veclib : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Saxpy #################################################### -saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -saxpy.acml : saxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpy.atlas : saxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpy.mkl : saxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpy.veclib : saxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Daxpy #################################################### -daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -daxpy.acml : daxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpy.atlas : daxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpy.mkl : daxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpy.veclib : daxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Caxpy #################################################### - -caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -caxpy.acml : caxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpy.atlas : caxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpy.mkl : caxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpy.veclib : caxpy.$(SUFFIX) - -$(CC) $(CFLAGS) 
-o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zaxpy #################################################### - -zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zaxpy.acml : zaxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpy.atlas : zaxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpy.mkl : zaxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpy.veclib : zaxpy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Saxpby #################################################### -saxpby.goto : saxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -saxpby.acml : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpby.atlas : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpby.mkl : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpby.veclib : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Daxpby #################################################### -daxpby.goto : daxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -daxpby.acml : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpby.atlas : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpby.mkl : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpby.veclib : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Caxpby #################################################### - -caxpby.goto : caxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -caxpby.acml : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpby.atlas : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpby.mkl : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpby.veclib : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zaxpby #################################################### - -zaxpby.goto : zaxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zaxpby.acml : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpby.atlas : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpby.mkl : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpby.veclib : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Scopy #################################################### 
-scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -scopy.acml : scopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scopy.atlas : scopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scopy.mkl : scopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -scopy.veclib : scopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dcopy #################################################### -dcopy.goto : dcopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dcopy.acml : dcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcopy.atlas : dcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcopy.mkl : dcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dcopy.veclib : dcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ccopy #################################################### - -ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ccopy.acml : ccopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccopy.atlas : ccopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccopy.mkl : ccopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ccopy.veclib : ccopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zcopy #################################################### - -zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zcopy.acml : zcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcopy.atlas : zcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcopy.mkl : zcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zcopy.veclib : zcopy.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sscal #################################################### -sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sscal.acml : sscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sscal.atlas : sscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sscal.mkl : sscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sscal.veclib : sscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dscal #################################################### -dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dscal.acml : dscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) - -dscal.atlas : dscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dscal.mkl : dscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dscal.veclib : dscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cscal #################################################### - -cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cscal.acml : cscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cscal.atlas : cscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cscal.mkl : cscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cscal.veclib : cscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zscal #################################################### - -zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zscal.acml : zscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zscal.atlas : zscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zscal.mkl : zscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zscal.veclib : zscal.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sasum #################################################### -sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sasum.acml : sasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sasum.atlas : sasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sasum.mkl : sasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sasum.veclib : sasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dasum #################################################### -dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dasum.acml : dasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dasum.atlas : dasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dasum.mkl : dasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dasum.veclib : dasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Casum #################################################### - -casum.goto : casum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -casum.acml : casum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -casum.atlas : casum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -casum.mkl : casum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -casum.veclib : casum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zasum #################################################### - -zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zasum.acml : zasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zasum.atlas : zasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zasum.mkl : zasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zasum.veclib : zasum.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sswap #################################################### -sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sswap.acml : sswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sswap.atlas : sswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sswap.mkl : sswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sswap.veclib : sswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dswap #################################################### -dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dswap.acml : dswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dswap.atlas : dswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dswap.mkl : dswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dswap.veclib : dswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cswap #################################################### - -cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cswap.acml : cswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cswap.atlas : cswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cswap.mkl : cswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cswap.veclib : cswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zswap #################################################### - -zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zswap.acml : zswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zswap.atlas : zswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zswap.mkl : zswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zswap.veclib : zswap.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - -##################################### 
Sgesv #################################################### -sgesv.goto : sgesv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sgesv.acml : sgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgesv.atlas : sgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgesv.mkl : sgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sgesv.veclib : sgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dgesv #################################################### -dgesv.goto : dgesv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dgesv.acml : dgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgesv.atlas : dgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgesv.mkl : dgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dgesv.veclib : dgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cgesv #################################################### - -cgesv.goto : cgesv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgesv.acml : cgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgesv.atlas : cgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgesv.mkl : cgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgesv.veclib : cgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgesv #################################################### - -zgesv.goto : zgesv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgesv.acml : zgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgesv.atlas : zgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgesv.mkl : zgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgesv.veclib : zgesv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - -##################################### Cgemm3m #################################################### - -cgemm3m.goto : cgemm3m.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cgemm3m.mkl : cgemm3m.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cgemm3m.veclib : cgemm3m.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zgemm3m #################################################### - -zgemm3m.goto : zgemm3m.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zgemm3m.mkl : zgemm3m.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zgemm3m.veclib : zgemm3m.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - 
-############################################## ISAMAX ############################################## -isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -isamax.atlas : isamax.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## IDAMAX ############################################## -idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -idamax.atlas : idamax.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## ICAMAX ############################################## -icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -icamax.atlas : icamax.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## IZAMAX ############################################## -izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -izamax.atlas : izamax.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## ISMAX ############################################## -ismax.goto : ismax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDMAX ############################################## -idmax.goto : idmax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ISAMIN ############################################## -isamin.goto : isamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDAMIN ############################################## -idamin.goto : idamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ICAMIN ############################################## -icamin.goto : icamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IZAMIN ############################################## -izamin.goto : izamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ISMIN ############################################## -ismin.goto : ismin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDMIN ############################################## -idmin.goto : idmin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SAMAX ############################################## -samax.goto : samax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DAMAX ############################################## -damax.goto : damax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - 
-############################################## CAMAX ############################################## -camax.goto : camax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ZAMAX ############################################## -zamax.goto : zamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SMAX ############################################## -smax.goto : smax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DMAX ############################################## -dmax.goto : dmax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SAMIN ############################################## -samin.goto : samin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DAMIN ############################################## -damin.goto : damin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## CAMIN ############################################## -camin.goto : camin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ZAMIN ############################################## -zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SMIN ############################################## -smin.goto : smin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DMIN ############################################## -dmin.goto : dmin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SNRM2 ############################################## -snrm2.goto : snrm2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -snrm2.atlas : snrm2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## DNRM2 ############################################## -dnrm2.goto : dnrm2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dnrm2.atlas : dnrm2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## Sscnrm2 ############################################## -scnrm2.goto : scnrm2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -scnrm2.atlas : scnrm2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -############################################## Ddznrm2 ############################################## -dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dznrm2.atlas : dznrm2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - 
-################################################################################################### - -slinpack.$(SUFFIX) : linpack.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dlinpack.$(SUFFIX) : linpack.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -clinpack.$(SUFFIX) : linpack.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zlinpack.$(SUFFIX) : linpack.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -scholesky.$(SUFFIX) : cholesky.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dcholesky.$(SUFFIX) : cholesky.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ccholesky.$(SUFFIX) : cholesky.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zcholesky.$(SUFFIX) : cholesky.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -ifeq ($(BUILD_BFLOAT16),1) -sbgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^ -endif - -sgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -ssymm.$(SUFFIX) : symm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsymm.$(SUFFIX) : symm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -csymm.$(SUFFIX) : symm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zsymm.$(SUFFIX) : symm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -strmm.$(SUFFIX) : trmm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtrmm.$(SUFFIX) : trmm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctrmm.$(SUFFIX) : trmm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztrmm.$(SUFFIX) : trmm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -strsm.$(SUFFIX) : trsm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtrsm.$(SUFFIX) : trsm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctrsm.$(SUFFIX) : trsm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztrsm.$(SUFFIX) : trsm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -ssyr.$(SUFFIX) : syr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsyr.$(SUFFIX) : syr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -sspr.$(SUFFIX) : spr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dspr.$(SUFFIX) : spr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -sspr2.$(SUFFIX) : spr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dspr2.$(SUFFIX) : spr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ssyr2.$(SUFFIX) : syr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsyr2.$(SUFFIX) : syr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ssyrk.$(SUFFIX) : syrk.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsyrk.$(SUFFIX) : syrk.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -csyrk.$(SUFFIX) : syrk.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zsyrk.$(SUFFIX) : syrk.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -ssyr2k.$(SUFFIX) : syr2k.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsyr2k.$(SUFFIX) : syr2k.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -csyr2k.$(SUFFIX) : syr2k.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zsyr2k.$(SUFFIX) : syr2k.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -chemm.$(SUFFIX) : hemm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o 
$(@F) $^ - -zhemm.$(SUFFIX) : hemm.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -cherk.$(SUFFIX) : herk.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zherk.$(SUFFIX) : herk.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -cher2k.$(SUFFIX) : her2k.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zher2k.$(SUFFIX) : her2k.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -cher.$(SUFFIX) : her.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zher.$(SUFFIX) : her.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -cher2.$(SUFFIX) : her2.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zher2.$(SUFFIX) : her2.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sspmv.$(SUFFIX) : spmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dspmv.$(SUFFIX) : spmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -strmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtrmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctrmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztrmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -stpmv.$(SUFFIX) : tpmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtpmv.$(SUFFIX) : tpmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctpmv.$(SUFFIX) : tpmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztpmv.$(SUFFIX) : tpmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -stpsv.$(SUFFIX) : tpsv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtpsv.$(SUFFIX) : tpsv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctpsv.$(SUFFIX) : tpsv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztpsv.$(SUFFIX) : tpsv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -strsv.$(SUFFIX) : trsv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtrsv.$(SUFFIX) : trsv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctrsv.$(SUFFIX) : trsv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztrsv.$(SUFFIX) : trsv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sger.$(SUFFIX) : ger.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dger.$(SUFFIX) : ger.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cger.$(SUFFIX) : ger.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zger.$(SUFFIX) : ger.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -ssymv.$(SUFFIX) : symv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsymv.$(SUFFIX) : symv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -csymv.$(SUFFIX) : symv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zsymv.$(SUFFIX) : symv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sgeev.$(SUFFIX) : geev.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgeev.$(SUFFIX) : geev.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgeev.$(SUFFIX) : geev.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgeev.$(SUFFIX) : geev.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sgetri.$(SUFFIX) : getri.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgetri.$(SUFFIX) : getri.c 
- $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgetri.$(SUFFIX) : getri.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgetri.$(SUFFIX) : getri.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -spotrf.$(SUFFIX) : potrf.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dpotrf.$(SUFFIX) : potrf.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cpotrf.$(SUFFIX) : potrf.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zpotrf.$(SUFFIX) : potrf.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -chemv.$(SUFFIX) : hemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zhemv.$(SUFFIX) : hemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -chbmv.$(SUFFIX) : hbmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zhbmv.$(SUFFIX) : hbmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -chpmv.$(SUFFIX) : hpmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zhpmv.$(SUFFIX) : hpmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sdot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -ddot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cdot.$(SUFFIX) : zdot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zdot.$(SUFFIX) : zdot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -cdot-intel.$(SUFFIX) : zdot-intel.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zdot-intel.$(SUFFIX) : zdot-intel.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - - -saxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -daxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -caxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zaxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -saxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -daxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -caxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zaxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -scopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dcopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ccopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zcopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - - -sscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -casum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - - -sgesv.$(SUFFIX) : gesv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - 
-dgesv.$(SUFFIX) : gesv.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-
-cgesv.$(SUFFIX) : gesv.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
-
-zgesv.$(SUFFIX) : gesv.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
-
-srot.$(SUFFIX) : rot.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
-
-drot.$(SUFFIX) : rot.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-
-csrot.$(SUFFIX) : rot.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
-
-zdrot.$(SUFFIX) : rot.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
-
-srotm.$(SUFFIX) : rotm.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
-
-drotm.$(SUFFIX) : rotm.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-
-
-
-cgemm3m.$(SUFFIX) : gemm3m.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
-
-zgemm3m.$(SUFFIX) : gemm3m.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
-
-
-isamax.$(SUFFIX) : iamax.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
-
-idamax.$(SUFFIX) : iamax.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-
-icamax.$(SUFFIX) : iamax.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
-
-izamax.$(SUFFIX) : iamax.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
-
-
-ismax.$(SUFFIX) : imax.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
-
-idmax.$(SUFFIX) : imax.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-
-
-isamin.$(SUFFIX) : iamin.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
-
-idamin.$(SUFFIX) : iamin.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-
-icamin.$(SUFFIX) : iamin.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
-
-izamin.$(SUFFIX) : iamin.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
-
-
-ismin.$(SUFFIX) : imin.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
-
-idmin.$(SUFFIX) : imin.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-
-
-samax.$(SUFFIX) : amax.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
-
-damax.$(SUFFIX) : amax.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-
-camax.$(SUFFIX) : amax.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
-
-zamax.$(SUFFIX) : amax.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
-
-
-smax.$(SUFFIX) : max.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
-
-dmax.$(SUFFIX) : max.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-
-
-samin.$(SUFFIX) : amin.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
-
-damin.$(SUFFIX) : amin.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-
-camin.$(SUFFIX) : amin.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
-
-zamin.$(SUFFIX) : amin.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
-
-
-smin.$(SUFFIX) : min.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
-
-dmin.$(SUFFIX) : min.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-
-
-snrm2.$(SUFFIX) : nrm2.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
-
-dnrm2.$(SUFFIX) : nrm2.c
-	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
-
-scnrm2.$(SUFFIX) : nrm2.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
-
-dznrm2.$(SUFFIX) : nrm2.c
-	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
-
-
-smallscaling: smallscaling.c ../$(LIBNAME)
-	$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread
-
-clean ::
-	@rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling
-
-include $(TOPDIR)/Makefile.tail
+TOPDIR = ..
+include $(TOPDIR)/Makefile.system + +# ACML standard +#ACML=/opt/acml5.3.1/gfortran64_mp/lib +#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm + +# ACML custom +#ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib +#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm + +# ACML 6.1 custom +ACML=/home/saar/acml6.1/gfortran64_mp/lib +LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm + + +# Atlas Ubuntu +#ATLAS=/usr/lib/atlas-base +#LIBATLAS = -fopenmp $(ATLAS)/liblapack_atlas.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm + +# Atlas RHEL and Fedora +ATLAS=/usr/lib64/atlas +LIBATLAS = -fopenmp $(ATLAS)/liblapack.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm + +# Intel standard +# MKL=/opt/intel/mkl/lib/intel64 +# LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm + +# Intel custom +MKL=/home/saar/intel_mkl +LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm + +# Apple vecLib +LIBVECLIB = -framework Accelerate + +ESSL=/opt/ibm/lib +#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a +LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a + +ifneq ($(NO_LAPACK), 1) +GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ + scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ + sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ + sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ + csymv.goto zsymv.goto \ + sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ + spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto +else +GOTO_LAPACK_TARGETS= +endif + +ifeq ($(BUILD_BFLOAT16),1) +GOTO_HALF_TARGETS=sbgemm.goto +else +GOTO_HALF_TARGETS= +endif + +ifeq ($(OSNAME), WINNT) + +goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ + scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ + sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ + strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ + strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ + sspr.goto dspr.goto \ + sspr2.goto dspr2.goto \ + ssyr.goto dsyr.goto \ + ssyr2.goto dsyr2.goto \ + ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ + ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ + sger.goto dger.goto cger.goto zger.goto \ + sdot.goto ddot.goto \ + srot.goto drot.goto csrot.goto zdrot.goto \ + srotm.goto drotm.goto \ + saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ + scopy.goto dcopy.goto ccopy.goto zcopy.goto \ + sswap.goto dswap.goto cswap.goto zswap.goto \ + sscal.goto dscal.goto cscal.goto zscal.goto \ + sasum.goto dasum.goto casum.goto zasum.goto \ + ssymv.goto dsymv.goto csymv.goto zsymv.goto \ + chemv.goto zhemv.goto \ + chbmv.goto zhbmv.goto \ + chpmv.goto zhpmv.goto \ + chemm.goto zhemm.goto \ + cherk.goto zherk.goto \ + cher2k.goto zher2k.goto \ + cher.goto zher.goto \ + cher2.goto zher2.goto \ + sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ + sspmv.goto dspmv.goto \ + strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ + stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \ + stpsv.goto dtpsv.goto ctpsv.goto ztpsv.goto \ + strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ + sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ + sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ + sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ + spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ + ssymm.goto dsymm.goto csymm.goto zsymm.goto 
\ + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS) + +acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ + scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ + sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ + strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ + strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ + sspr.acml dspr.acml \ + sspr2.acml dspr2.acml \ + ssyr.acml dsyr.acml \ + ssyr2.acml dsyr2.acml \ + ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ + ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ + sger.acml dger.acml cger.acml zger.acml \ + sdot.acml ddot.acml \ + srot.acml drot.acml csrot.acml zdrot.acml \ + srotm.acml drotm.acml \ + saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ + scopy.acml dcopy.acml ccopy.acml zcopy.acml \ + sswap.acml dswap.acml cswap.acml zswap.acml \ + sscal.acml dscal.acml cscal.acml zscal.acml \ + sasum.acml dasum.acml casum.acml zasum.acml \ + ssymv.acml dsymv.acml csymv.acml zsymv.acml \ + chemv.acml zhemv.acml \ + chbmv.acml zhbmv.acml \ + chpmv.acml zhpmv.acml \ + chemm.acml zhemm.acml \ + cherk.acml zherk.acml \ + cher2k.acml zher2k.acml \ + cher.acml zher.acml \ + cher2.acml zher2.acml \ + sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ + strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ + stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ + stpsv.acml dtpsv.acml ctpsv.acml ztpsv.acml \ + strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ + sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ + sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ + sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ + spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ + ssymm.acml dsymm.acml csymm.acml zsymm.acml \ + saxpby.acml daxpby.acml caxpby.acml zaxpby.acml + +atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ + scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ + sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ + strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ + strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ + sspr.atlas dspr.atlas \ + sspr2.atlas dspr2.atlas \ + ssyr.atlas dsyr.atlas \ + ssyr2.atlas dsyr2.atlas \ + ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ + ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ + sger.atlas dger.atlas cger.atlas zger.atlas\ + sdot.atlas ddot.atlas \ + srot.atlas drot.atlas csrot.atlas zdrot.atlas \ + srotm.atlas drotm.atlas \ + saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ + scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ + sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ + sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ + sasum.atlas dasum.atlas casum.atlas zasum.atlas \ + ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ + chemv.atlas zhemv.atlas \ + chbmv.atlas zhbmv.atlas \ + chpmv.atlas zhpmv.atlas \ + chemm.acml zhemm.acml \ + chemm.atlas zhemm.atlas \ + cherk.atlas zherk.atlas \ + cher2k.atlas zher2k.atlas \ + cher.atlas zher.atlas \ + cher2.atlas zher2.atlas \ + sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ + sspmv.atlas dspmv.atlas \ + strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ + stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \ + stpsv.atlas dtpsv.atlas ctpsv.atlas ztpsv.atlas \ + strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ + sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ + sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ + sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ + spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ + ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ + saxpby.atlas daxpby.atlas caxpby.atlas 
zaxpby.atlas + +mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ + scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ + sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ + strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ + strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ + sspr.mkl dspr.mkl \ + sspr2.mkl dspr2.mkl \ + ssyr.mkl dsyr.mkl \ + ssyr2.mkl dsyr2.mkl \ + ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ + ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ + sger.mkl dger.mkl cger.mkl zger.mkl \ + sdot.mkl ddot.mkl \ + srot.mkl drot.mkl csrot.mkl zdrot.mkl \ + srotm.mkl drotm.mkl \ + saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ + scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ + sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ + sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ + sasum.mkl dasum.mkl casum.mkl zasum.mkl \ + ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ + chemv.mkl zhemv.mkl \ + chbmv.mkl zhbmv.mkl \ + chpmv.mkl zhpmv.mkl \ + chemm.mkl zhemm.mkl \ + cherk.mkl zherk.mkl \ + cher2k.mkl zher2k.mkl \ + cher.mkl zher.mkl \ + cher2.mkl zher2.mkl \ + sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ + strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ + stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ + stpsv.mkl dtpsv.mkl ctpsv.mkl ztpsv.mkl \ + strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ + sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ + sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ + sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ + spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ + ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ + saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl + +else + +goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ + strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ + strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ + sspr.goto dspr.goto \ + sspr2.goto dspr2.goto \ + ssyr.goto dsyr.goto \ + ssyr2.goto dsyr2.goto \ + ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ + ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ + sger.goto dger.goto cger.goto zger.goto \ + sdot.goto ddot.goto cdot.goto zdot.goto \ + srot.goto drot.goto csrot.goto zdrot.goto \ + srotm.goto drotm.goto \ + saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ + scopy.goto dcopy.goto ccopy.goto zcopy.goto \ + sswap.goto dswap.goto cswap.goto zswap.goto \ + sscal.goto dscal.goto cscal.goto zscal.goto \ + sasum.goto dasum.goto casum.goto zasum.goto \ + ssymv.goto dsymv.goto \ + chemv.goto zhemv.goto \ + chbmv.goto zhbmv.goto \ + chpmv.goto zhpmv.goto \ + chemm.goto zhemm.goto \ + cherk.goto zherk.goto \ + cher2k.goto zher2k.goto \ + cher.goto zher.goto \ + cher2.goto zher2.goto \ + sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ + sspmv.goto dspmv.goto \ + strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ + stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \ + stpsv.goto dtpsv.goto ctpsv.goto ztpsv.goto \ + strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ + ssymm.goto dsymm.goto csymm.goto zsymm.goto \ + smallscaling \ + isamax.goto idamax.goto icamax.goto izamax.goto \ + ismax.goto idmax.goto \ + isamin.goto idamin.goto icamin.goto izamin.goto \ + ismin.goto idmin.goto \ + samax.goto damax.goto camax.goto zamax.goto \ + smax.goto dmax.goto \ + samin.goto damin.goto camin.goto zamin.goto \ + smin.goto dmin.goto \ + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ + snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) + +acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ + scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ + sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ + strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ + strsm.acml dtrsm.acml 
ctrsm.acml ztrsm.acml \ + sspr.acml dspr.acml \ + sspr2.acml dspr2.acml \ + ssyr.acml dsyr.acml \ + ssyr2.acml dsyr2.acml \ + ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ + ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ + sger.acml dger.acml cger.acml zger.acml \ + sdot.acml ddot.acml \ + srot.acml drot.acml csrot.acml zdrot.acml \ + srotm.acml drotm.acml \ + saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ + scopy.acml dcopy.acml ccopy.acml zcopy.acml \ + sswap.acml dswap.acml cswap.acml zswap.acml \ + sscal.acml dscal.acml cscal.acml zscal.acml \ + sasum.acml dasum.acml casum.acml zasum.acml \ + ssymv.acml dsymv.acml csymv.acml zsymv.acml \ + chemv.acml zhemv.acml \ + chbmv.acml zhbmv.acml \ + chpmv.acml zhpmv.acml \ + chemm.acml zhemm.acml \ + cherk.acml zherk.acml \ + cher2k.acml zher2k.acml \ + cher.acml zher.acml \ + cher2.acml zher2.acml \ + sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ + strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ + stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ + stpsv.acml dtpsv.acml ctpsv.acml ztpsv.acml \ + strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ + sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ + sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ + sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ + spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ + ssymm.acml dsymm.acml csymm.acml zsymm.acml \ + saxpby.acml daxpby.acml caxpby.acml zaxpby.acml + +atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ + scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ + sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ + strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ + strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ + sspr.atlas dspr.atlas \ + sspr2.atlas dspr2.atlas \ + ssyr.atlas dsyr.atlas \ + ssyr2.atlas dsyr2.atlas \ + ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ + ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ + sger.atlas dger.atlas cger.atlas zger.atlas\ + sdot.atlas ddot.atlas \ + srot.atlas drot.atlas csrot.atlas zdrot.atlas \ + srotm.atlas drotm.atlas \ + saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ + scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ + sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ + sscal.atlas dscal.atlas cscal.atlas zscal.atlas \ + sasum.atlas dasum.atlas casum.atlas zasum.atlas \ + ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ + chemv.atlas zhemv.atlas \ + chbmv.atlas zhbmv.atlas \ + chpmv.atlas zhpmv.atlas \ + chemm.acml zhemm.acml \ + chemm.atlas zhemm.atlas \ + cherk.atlas zherk.atlas \ + cher2k.atlas zher2k.atlas \ + cher.atlas zher.atlas \ + cher2.atlas zher2.atlas \ + sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ + sspmv.atlas dspmv.atlas \ + strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ + stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \ + stpsv.atlas dtpsv.atlas ctpsv.atlas ztpsv.atlas \ + strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ + sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ + sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ + sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ + spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ + ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ + isamax.atlas idamax.atlas icamax.atlas izamax.atlas \ + snrm2.atlas dnrm2.atlas scnrm2.atlas dznrm2.atlas \ + saxpby.atlas daxpby.atlas caxpby.atlas zaxpby.atlas + +mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ + scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ + sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ + strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl 
\ + strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ + sspr.mkl dspr.mkl \ + sspr2.mkl dspr2.mkl \ + ssyr.mkl dsyr.mkl \ + ssyr2.mkl dsyr2.mkl \ + ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ + ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ + sger.mkl dger.mkl cger.mkl zger.mkl \ + sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ + srot.atlas drot.atlas csrot.atlas zdrot.atlas \ + srotm.atlas drotm.atlas \ + saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ + scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ + sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ + sscal.mkl dscal.mkl cscal.mkl zscal.mkl \ + sasum.mkl dasum.mkl casum.mkl zasum.mkl \ + ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ + chemv.mkl zhemv.mkl \ + chbmv.mkl zhbmv.mkl \ + chpmv.mkl zhpmv.mkl \ + chemm.mkl zhemm.mkl \ + cherk.mkl zherk.mkl \ + cher2k.mkl zher2k.mkl \ + cher.mkl zher.mkl \ + cher2.mkl zher2.mkl \ + sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ + strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ + stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ + stpsv.mkl dtpsv.mkl ctpsv.mkl ztpsv.mkl \ + strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ + sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ + sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ + sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ + spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ + ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ + saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl + + + + +endif + +essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ + cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ + slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl \ + scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl \ + strsm.essl dtrsm.essl ctrsm.essl ztrsm.essl + +veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ + scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ + sgemm.veclib dgemm.veclib cgemm.veclib zgemm.veclib \ + strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \ + strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \ + sspr.veclib dspr.veclib \ + sspr2.veclib dspr2.veclib \ + ssyr.veclib dsyr.veclib \ + ssyr2.veclib dsyr2.veclib \ + ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \ + ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ + sger.veclib dger.veclib cger.veclib zger.veclib \ + sdot.veclib ddot.veclib cdot.veclib zdot.veclib \ + srot.veclib drot.veclib csrot.veclib zdrot.veclib \ + srotm.veclib drotm.veclib \ + saxpy.veclib daxpy.veclib caxpy.veclib zaxpy.veclib \ + scopy.veclib dcopy.veclib ccopy.veclib zcopy.veclib \ + sswap.veclib dswap.veclib cswap.veclib zswap.veclib \ + sscal.veclib dscal.veclib cscal.veclib zscal.veclib \ + sasum.veclib dasum.veclib casum.veclib zasum.veclib \ + ssymv.veclib dsymv.veclib csymv.veclib zsymv.veclib \ + chemv.veclib zhemv.veclib \ + chbmv.veclib zhbmv.veclib \ + chpmv.veclib zhpmv.veclib \ + chemm.veclib zhemm.veclib \ + cherk.veclib zherk.veclib \ + cher2k.veclib zher2k.veclib \ + cher.veclib zher.veclib \ + cher2.veclib zher2.veclib \ + sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ + strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ + stpmv.veclib dtpmv.veclib ctpmv.veclib ztpmv.veclib \ + stpsv.veclib dtpsv.veclib ctpsv.veclib ztpsv.veclib \ + strsv.veclib dtrsv.veclib ctrsv.veclib ztrsv.veclib \ + sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ + sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ + sgetri.veclib dgetri.veclib cgetri.veclib zgetri.veclib \ + spotrf.veclib dpotrf.veclib cpotrf.veclib zpotrf.veclib \ + ssymm.veclib dsymm.veclib csymm.veclib zsymm.veclib \ + saxpby.veclib daxpby.veclib caxpby.veclib 
zaxpby.veclib + +goto_3m :: cgemm3m.goto zgemm3m.goto + +mkl_3m :: cgemm3m.mkl zgemm3m.mkl + +all :: goto mkl atlas acml veclib + +exe : + @./Make_exe.sh + +##################################### Slinpack #################################################### +slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +slinpack.acml : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.atlas : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.mkl : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.veclib : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.essl : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dlinpack #################################################### +dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dlinpack.acml : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.atlas : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.mkl : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.veclib : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.essl : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Clinpack #################################################### + +clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +clinpack.acml : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.atlas : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.mkl : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.veclib : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.essl : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zlinpack #################################################### + +zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zlinpack.acml : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.atlas : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.mkl : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.veclib : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.essl : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Scholesky ################################################### + +scholesky.goto 
: scholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +scholesky.acml : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.atlas : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.mkl : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.veclib : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.essl : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dcholesky ################################################### + +dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dcholesky.acml : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.atlas : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.mkl : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.veclib : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.essl : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ccholesky ################################################### + +ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ccholesky.acml : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.atlas : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.mkl : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.veclib : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.essl : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +##################################### Zcholesky ################################################### + +zcholesky.goto : zcholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zcholesky.acml : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.atlas : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.mkl : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.veclib : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.essl : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sgemm #################################################### +ifeq ($(BUILD_BFLOAT16),1) +sbgemm.goto : sbgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm +endif + +sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) -lm + +sgemm.acml : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemm.atlas : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemm.mkl : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemm.veclib : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemm.essl : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgemm #################################################### +dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgemm.acml : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemm.atlas : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemm.mkl : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemm.veclib : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemm.essl : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgemm #################################################### + +cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgemm.acml : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm.atlas : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm.mkl : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm.veclib : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm.essl : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgemm #################################################### + +zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgemm.acml : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemm.atlas : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemm.mkl : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemm.veclib : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemm.essl : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ssymm #################################################### +ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssymm.acml : ssymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymm.atlas : ssymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymm.mkl : ssymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymm.veclib : ssymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dsymm #################################################### +dsymm.goto : dsymm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsymm.acml : dsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymm.atlas : dsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymm.mkl : dsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymm.veclib : dsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Csymm #################################################### + +csymm.goto : csymm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csymm.acml : csymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymm.atlas : csymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymm.mkl : csymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymm.veclib : csymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zsymm #################################################### + +zsymm.goto : zsymm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zsymm.acml : zsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymm.atlas : zsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymm.mkl : zsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymm.veclib : zsymm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Strmm #################################################### +strmm.goto : strmm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strmm.acml : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmm.atlas : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmm.mkl : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmm.veclib : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmm.essl : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrmm #################################################### +dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrmm.acml : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmm.atlas : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmm.mkl : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmm.veclib : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmm.essl : dtrmm.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrmm #################################################### + +ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrmm.acml : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmm.atlas : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmm.mkl : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmm.veclib : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmm.essl : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrmm #################################################### + +ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrmm.acml : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmm.atlas : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmm.mkl : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmm.veclib : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmm.essl : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Strsm #################################################### +strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strsm.acml : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsm.atlas : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsm.mkl : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsm.veclib : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsm.essl : strsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrsm #################################################### +dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrsm.acml : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsm.atlas : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsm.mkl : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsm.veclib : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsm.essl : dtrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrsm #################################################### + +ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrsm.acml : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsm.atlas : 
ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsm.mkl : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsm.veclib : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsm.essl : ctrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrsm #################################################### + +ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrsm.acml : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsm.atlas : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsm.mkl : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsm.veclib : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsm.essl : ztrsm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Ssyr #################################################### +ssyr.goto : ssyr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyr.acml : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.atlas : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.mkl : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.veclib : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Dsyr #################################################### +dsyr.goto : dsyr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyr.acml : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.atlas : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.mkl : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.veclib : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sspr #################################################### +sspr.goto : sspr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sspr.acml : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.atlas : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.mkl : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.veclib : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dspr #################################################### +dspr.goto : dspr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dspr.acml : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.atlas : dspr.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.mkl : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.veclib : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sspr2 #################################################### +sspr2.goto : sspr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sspr2.acml : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.atlas : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.mkl : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.veclib : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dspr2 #################################################### +dspr2.goto : dspr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dspr2.acml : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.atlas : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.mkl : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.veclib : dspr2.$(SUFFIX) + +##################################### Ssyr2 #################################################### +ssyr2.goto : ssyr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyr2.acml : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.atlas : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.mkl : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.veclib : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Dsyr2 #################################################### +dsyr2.goto : dsyr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyr2.acml : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.atlas : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.mkl : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.veclib : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ssyrk #################################################### +ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyrk.acml : ssyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyrk.atlas : ssyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyrk.mkl : ssyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyrk.veclib : ssyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + 
+##################################### Dsyrk #################################################### +dsyrk.goto : dsyrk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyrk.acml : dsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyrk.atlas : dsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyrk.mkl : dsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyrk.veclib : dsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Csyrk #################################################### + +csyrk.goto : csyrk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csyrk.acml : csyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyrk.atlas : csyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyrk.mkl : csyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyrk.veclib : csyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zsyrk #################################################### + +zsyrk.goto : zsyrk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zsyrk.acml : zsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyrk.atlas : zsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyrk.mkl : zsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyrk.veclib : zsyrk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ssyr2k #################################################### +ssyr2k.goto : ssyr2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyr2k.acml : ssyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2k.atlas : ssyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2k.mkl : ssyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2k.veclib : ssyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dsyr2k #################################################### +dsyr2k.goto : dsyr2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyr2k.acml : dsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2k.atlas : dsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2k.mkl : dsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2k.veclib : dsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Csyr2k #################################################### + +csyr2k.goto : csyr2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csyr2k.acml : csyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyr2k.atlas : csyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyr2k.mkl : csyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csyr2k.veclib : csyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zsyr2k #################################################### + +zsyr2k.goto : zsyr2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zsyr2k.acml : zsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyr2k.atlas : zsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyr2k.mkl : zsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsyr2k.veclib : zsyr2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Chemm #################################################### + +chemm.goto : chemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chemm.acml : chemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemm.atlas : chemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemm.mkl : chemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemm.veclib : chemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zhemm #################################################### + +zhemm.goto : zhemm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhemm.acml : zhemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemm.atlas : zhemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemm.mkl : zhemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemm.veclib : zhemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cherk #################################################### + +cherk.goto : cherk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cherk.acml : cherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cherk.atlas : cherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cherk.mkl : cherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cherk.veclib : cherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zherk #################################################### + +zherk.goto : zherk.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zherk.acml : zherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zherk.atlas : 
zherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zherk.mkl : zherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zherk.veclib : zherk.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cher2k #################################################### + +cher2k.goto : cher2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cher2k.acml : cher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2k.atlas : cher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2k.mkl : cher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2k.veclib : cher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zher2k #################################################### + +zher2k.goto : zher2k.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zher2k.acml : zher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2k.atlas : zher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2k.mkl : zher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2k.veclib : zher2k.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cher #################################################### + +cher.goto : cher.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cher.acml : cher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher.atlas : cher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher.mkl : cher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher.veclib : cher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zher #################################################### + +zher.goto : zher.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zher.acml : zher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher.atlas : zher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher.mkl : zher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher.veclib : zher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cher2 #################################################### + +cher2.goto : cher2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cher2.acml : cher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2.atlas : cher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2.mkl : cher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) + +cher2.veclib : cher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zher2 #################################################### + +zher2.goto : zher2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zher2.acml : zher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2.atlas : zher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2.mkl : zher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2.veclib : zher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sgemv #################################################### +sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sgemv.acml : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemv.atlas : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemv.mkl : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgemv.veclib : sgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgemv #################################################### +dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgemv.acml : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemv.atlas : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemv.mkl : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgemv.veclib : dgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgemv #################################################### + +cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgemv.acml : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemv.atlas : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemv.mkl : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemv.veclib : cgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgemv #################################################### + +zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgemv.acml : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemv.atlas : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemv.mkl : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemv.veclib : zgemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sspmv 
#################################################### +sspmv.goto : sspmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sspmv.atlas : sspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dspmv #################################################### +dspmv.goto : dspmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dspmv.atlas : dspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Strmv #################################################### +strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strmv.acml : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.atlas : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.mkl : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.veclib : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrmv #################################################### +dtrmv.goto : dtrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrmv.acml : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.atlas : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.mkl : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.veclib : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrmv #################################################### + +ctrmv.goto : ctrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrmv.acml : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.atlas : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.mkl : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.veclib : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrmv #################################################### + +ztrmv.goto : ztrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrmv.acml : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.atlas : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.mkl : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.veclib : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +##################################### Stpmv #################################################### +stpmv.goto : stpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +stpmv.acml : stpmv.$(SUFFIX) + -$(CC) 
$(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpmv.atlas : stpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpmv.mkl : stpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpmv.veclib : stpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtpmv #################################################### +dtpmv.goto : dtpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtpmv.acml : dtpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpmv.atlas : dtpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpmv.mkl : dtpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpmv.veclib : dtpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctpmv #################################################### + +ctpmv.goto : ctpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctpmv.acml : ctpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpmv.atlas : ctpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpmv.mkl : ctpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpmv.veclib : ctpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztpmv #################################################### + +ztpmv.goto : ztpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztpmv.acml : ztpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpmv.atlas : ztpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpmv.mkl : ztpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpmv.veclib : ztpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Stpsv #################################################### +stpsv.goto : stpsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +stpsv.acml : stpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpsv.atlas : stpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpsv.mkl : stpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpsv.veclib : stpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtpsv #################################################### +dtpsv.goto : dtpsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtpsv.acml : dtpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpsv.atlas : dtpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpsv.mkl : 
dtpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpsv.veclib : dtpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctpsv #################################################### + +ctpsv.goto : ctpsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctpsv.acml : ctpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpsv.atlas : ctpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpsv.mkl : ctpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpsv.veclib : ctpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztpsv #################################################### + +ztpsv.goto : ztpsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztpsv.acml : ztpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpsv.atlas : ztpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpsv.mkl : ztpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpsv.veclib : ztpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Strsv #################################################### +strsv.goto : strsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strsv.acml : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.atlas : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.mkl : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.veclib : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrsv #################################################### +dtrsv.goto : dtrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrsv.acml : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsv.atlas : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsv.mkl : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsv.veclib : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrsv #################################################### + +ctrsv.goto : ctrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrsv.acml : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.atlas : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.mkl : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.veclib : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) + +##################################### Ztrsv #################################################### + +ztrsv.goto : ztrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrsv.acml : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.atlas : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.mkl : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.veclib : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sger #################################################### +sger.goto : sger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sger.acml : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sger.atlas : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sger.mkl : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sger.veclib : sger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dger #################################################### +dger.goto : dger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dger.acml : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dger.atlas : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dger.mkl : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dger.veclib : dger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cger #################################################### +cger.goto : cger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cger.acml : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cger.atlas : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cger.mkl : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cger.veclib : cger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zger #################################################### +zger.goto : zger.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zger.acml : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zger.atlas : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zger.mkl : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zger.veclib : zger.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ssymv #################################################### +ssymv.goto : ssymv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssymv.acml : 
ssymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymv.atlas : ssymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymv.mkl : ssymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssymv.veclib : ssymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dsymv #################################################### +dsymv.goto : dsymv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsymv.acml : dsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymv.atlas : dsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymv.mkl : dsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsymv.veclib : dsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Csymv #################################################### +csymv.goto : csymv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csymv.acml : csymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymv.atlas : csymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymv.mkl : csymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csymv.veclib : csymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dsymv #################################################### +zsymv.goto : zsymv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zsymv.acml : zsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymv.atlas : zsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymv.mkl : zsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zsymv.veclib : zsymv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sgeev #################################################### +sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sgeev.acml : sgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgeev.atlas : sgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgeev.mkl : sgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgeev.veclib : sgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgeev #################################################### +dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgeev.acml : dgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgeev.atlas : dgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + 
+dgeev.mkl : dgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgeev.veclib : dgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgeev #################################################### + +cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgeev.acml : cgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgeev.atlas : cgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgeev.mkl : cgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgeev.veclib : cgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgeev #################################################### + +zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgeev.acml : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgeev.atlas : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgeev.mkl : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgeev.veclib : zgeev.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sgetri #################################################### +sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sgetri.acml : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgetri.atlas : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgetri.mkl : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgetri.veclib : sgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgetri #################################################### +dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgetri.acml : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgetri.atlas : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgetri.mkl : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgetri.veclib : dgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgetri #################################################### + +cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgetri.acml : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgetri.atlas : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgetri.mkl : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgetri.veclib : cgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgetri #################################################### + +zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME) + $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgetri.acml : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgetri.atlas : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgetri.mkl : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgetri.veclib : zgetri.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Spotrf #################################################### +spotrf.goto : spotrf.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +spotrf.acml : spotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +spotrf.atlas : spotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +spotrf.mkl : spotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +spotrf.veclib : spotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dpotrf #################################################### +dpotrf.goto : dpotrf.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dpotrf.acml : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dpotrf.atlas : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dpotrf.mkl : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dpotrf.veclib : dpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cpotrf #################################################### + +cpotrf.goto : cpotrf.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cpotrf.acml : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cpotrf.atlas : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cpotrf.mkl : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cpotrf.veclib : cpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zpotrf #################################################### + +zpotrf.goto : zpotrf.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zpotrf.acml : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zpotrf.atlas : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zpotrf.mkl : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zpotrf.veclib : zpotrf.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Chemv #################################################### + 
+chemv.goto : chemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chemv.acml : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemv.atlas : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemv.mkl : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chemv.veclib : chemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zhemv #################################################### + +zhemv.goto : zhemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhemv.acml : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemv.atlas : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemv.mkl : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhemv.veclib : zhemv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Chbmv #################################################### + +chbmv.goto : chbmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chbmv.acml : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.atlas : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.mkl : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.veclib : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Zhbmv #################################################### + +zhbmv.goto : zhbmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhbmv.acml : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.atlas : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.mkl : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.veclib : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Chpmv #################################################### + +chpmv.goto : chpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chpmv.acml : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.atlas : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.mkl : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.veclib : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Zhpmv #################################################### + +zhpmv.goto : zhpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhpmv.acml : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) 
$(EXTRALIB) $(FEXTRALIB) + +zhpmv.atlas : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhpmv.mkl : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhpmv.veclib : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Sdot #################################################### +sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sdot.acml : sdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sdot.atlas : sdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sdot.mkl : sdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sdot.veclib : sdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ddot #################################################### +ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ddot.acml : ddot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ddot.atlas : ddot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ddot.mkl : ddot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ddot.veclib : ddot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cdot #################################################### +cdot.goto : cdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cdot.acml : cdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cdot.atlas : cdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cdot.mkl : cdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cdot.veclib : cdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zdot #################################################### +zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zdot.acml : zdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdot.atlas : zdot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdot.mkl : zdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdot.veclib : zdot-intel.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Srot #################################################### +srot.goto : srot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +srot.acml : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srot.atlas : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srot.mkl : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srot.veclib 
: srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Drot #################################################### +drot.goto : drot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +drot.acml : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drot.atlas : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drot.mkl : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drot.veclib : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### csrot #################################################### +csrot.goto : csrot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csrot.acml : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csrot.atlas : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csrot.mkl : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csrot.veclib : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### zdrot #################################################### +zdrot.goto : zdrot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zdrot.acml : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdrot.atlas : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdrot.mkl : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdrot.veclib : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### srotm #################################################### +srotm.goto : srotm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +srotm.acml : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.atlas : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.mkl : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.veclib : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### drotm #################################################### +drotm.goto : drotm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +drotm.acml : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.atlas : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.mkl : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.veclib : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Saxpy #################################################### +saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) + 
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +saxpy.acml : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpy.atlas : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpy.mkl : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpy.veclib : saxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Daxpy #################################################### +daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +daxpy.acml : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpy.atlas : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpy.mkl : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpy.veclib : daxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Caxpy #################################################### + +caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +caxpy.acml : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpy.atlas : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpy.mkl : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpy.veclib : caxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zaxpy #################################################### + +zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zaxpy.acml : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpy.atlas : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpy.mkl : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpy.veclib : zaxpy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Saxpby #################################################### +saxpby.goto : saxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +saxpby.acml : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpby.atlas : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpby.mkl : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpby.veclib : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Daxpby #################################################### +daxpby.goto : daxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +daxpby.acml : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + 
+daxpby.atlas : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpby.mkl : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpby.veclib : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Caxpby #################################################### + +caxpby.goto : caxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +caxpby.acml : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpby.atlas : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpby.mkl : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpby.veclib : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zaxpby #################################################### + +zaxpby.goto : zaxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zaxpby.acml : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpby.atlas : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpby.mkl : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpby.veclib : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Scopy #################################################### +scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +scopy.acml : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scopy.atlas : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scopy.mkl : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scopy.veclib : scopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dcopy #################################################### +dcopy.goto : dcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dcopy.acml : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcopy.atlas : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcopy.mkl : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcopy.veclib : dcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ccopy #################################################### + +ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ccopy.acml : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccopy.atlas : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccopy.mkl : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ 
$(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccopy.veclib : ccopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zcopy #################################################### + +zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zcopy.acml : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcopy.atlas : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcopy.mkl : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcopy.veclib : zcopy.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sscal #################################################### +sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sscal.acml : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sscal.atlas : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sscal.mkl : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sscal.veclib : sscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dscal #################################################### +dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dscal.acml : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dscal.atlas : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dscal.mkl : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dscal.veclib : dscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cscal #################################################### + +cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cscal.acml : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cscal.atlas : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cscal.mkl : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cscal.veclib : cscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zscal #################################################### + +zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zscal.acml : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zscal.atlas : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zscal.mkl : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zscal.veclib : zscal.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### 
Sasum #################################################### +sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sasum.acml : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sasum.atlas : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sasum.mkl : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sasum.veclib : sasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dasum #################################################### +dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dasum.acml : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dasum.atlas : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dasum.mkl : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dasum.veclib : dasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Casum #################################################### + +casum.goto : casum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +casum.acml : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +casum.atlas : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +casum.mkl : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +casum.veclib : casum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zasum #################################################### + +zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zasum.acml : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zasum.atlas : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zasum.mkl : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zasum.veclib : zasum.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sswap #################################################### +sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sswap.acml : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sswap.atlas : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sswap.mkl : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sswap.veclib : sswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dswap #################################################### +dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dswap.acml : dswap.$(SUFFIX) + 
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dswap.atlas : dswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dswap.mkl : dswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dswap.veclib : dswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cswap #################################################### + +cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cswap.acml : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cswap.atlas : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cswap.mkl : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cswap.veclib : cswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zswap #################################################### + +zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zswap.acml : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zswap.atlas : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zswap.mkl : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zswap.veclib : zswap.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +##################################### Sgesv #################################################### +sgesv.goto : sgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sgesv.acml : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgesv.atlas : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgesv.mkl : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sgesv.veclib : sgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dgesv #################################################### +dgesv.goto : dgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dgesv.acml : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgesv.atlas : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgesv.mkl : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dgesv.veclib : dgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cgesv #################################################### + +cgesv.goto : cgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgesv.acml : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgesv.atlas : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgesv.mkl : 
cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgesv.veclib : cgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgesv #################################################### + +zgesv.goto : zgesv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgesv.acml : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgesv.atlas : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgesv.mkl : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgesv.veclib : zgesv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +##################################### Cgemm3m #################################################### + +cgemm3m.goto : cgemm3m.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cgemm3m.mkl : cgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cgemm3m.veclib : cgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zgemm3m #################################################### + +zgemm3m.goto : zgemm3m.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zgemm3m.mkl : zgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zgemm3m.veclib : zgemm3m.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## ISAMAX ############################################## +isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +isamax.atlas : isamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## IDAMAX ############################################## +idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +idamax.atlas : idamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## ICAMAX ############################################## +icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +icamax.atlas : icamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## IZAMAX ############################################## +izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +izamax.atlas : izamax.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## ISMAX ############################################## +ismax.goto : ismax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDMAX ############################################## +idmax.goto : idmax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ 
$(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISAMIN ############################################## +isamin.goto : isamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDAMIN ############################################## +idamin.goto : idamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ICAMIN ############################################## +icamin.goto : icamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IZAMIN ############################################## +izamin.goto : izamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISMIN ############################################## +ismin.goto : ismin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDMIN ############################################## +idmin.goto : idmin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SAMAX ############################################## +samax.goto : samax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DAMAX ############################################## +damax.goto : damax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## CAMAX ############################################## +camax.goto : camax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ZAMAX ############################################## +zamax.goto : zamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SMAX ############################################## +smax.goto : smax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DMAX ############################################## +dmax.goto : dmax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SAMIN ############################################## +samin.goto : samin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DAMIN ############################################## +damin.goto : damin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## CAMIN ############################################## +camin.goto : camin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ZAMIN ############################################## +zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + 
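All of the linking rules in this hunk follow one shape: every <name>.goto target links the benchmark driver against the freshly built ../$(LIBNAME) (plus -lm), while the optional .acml/.atlas/.mkl/.veclib targets link the same object against an external BLAS, and the leading '-' on those recipes lets make keep going when that reference library is not installed. Purely as a minimal sketch -- not part of the patch, and assuming the LIBATLAS/CEXTRALIB/EXTRALIB/FEXTRALIB variables already defined earlier in this Makefile -- the same pattern could be generated once with a GNU make template:

# Illustrative only; the hand-written per-target rules above are what the patch actually adds.
define link-bench
$(1).goto : $(1).$$(SUFFIX) ../$$(LIBNAME)
	$$(CC) $$(CFLAGS) -o $$(@F) $$^ $$(CEXTRALIB) $$(EXTRALIB) $$(FEXTRALIB) -lm

$(1).atlas : $(1).$$(SUFFIX)
	-$$(CC) $$(CFLAGS) -o $$(@F) $$^ $$(LIBATLAS) $$(CEXTRALIB) $$(EXTRALIB) $$(FEXTRALIB)
endef

# e.g. for the index-of-max benchmarks defined just above:
$(foreach b,isamax idamax icamax izamax,$(eval $(call link-bench,$(b))))

The fully spelled-out rules used here are more verbose, but they keep every target individually greppable and require no GNU-make-specific metaprogramming.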
+############################################## SMIN ############################################## +smin.goto : smin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DMIN ############################################## +dmin.goto : dmin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SNRM2 ############################################## +snrm2.goto : snrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +snrm2.atlas : snrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## DNRM2 ############################################## +dnrm2.goto : dnrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dnrm2.atlas : dnrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## Sscnrm2 ############################################## +scnrm2.goto : scnrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +scnrm2.atlas : scnrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +############################################## Ddznrm2 ############################################## +dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dznrm2.atlas : dznrm2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + +################################################################################################### + +slinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dlinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +clinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zlinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +scholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dcholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ccholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zcholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +ifeq ($(BUILD_BFLOAT16),1) +sbgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^ +endif + +sgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +ssymm.$(SUFFIX) : symm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsymm.$(SUFFIX) : symm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +csymm.$(SUFFIX) : symm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zsymm.$(SUFFIX) : symm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +strmm.$(SUFFIX) : trmm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrmm.$(SUFFIX) : trmm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrmm.$(SUFFIX) : trmm.c + $(CC) $(CFLAGS) -c 
-DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrmm.$(SUFFIX) : trmm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +strsm.$(SUFFIX) : trsm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrsm.$(SUFFIX) : trsm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrsm.$(SUFFIX) : trsm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrsm.$(SUFFIX) : trsm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +ssyr.$(SUFFIX) : syr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyr.$(SUFFIX) : syr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +sspr.$(SUFFIX) : spr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dspr.$(SUFFIX) : spr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +sspr2.$(SUFFIX) : spr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dspr2.$(SUFFIX) : spr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ssyr2.$(SUFFIX) : syr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyr2.$(SUFFIX) : syr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ssyrk.$(SUFFIX) : syrk.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyrk.$(SUFFIX) : syrk.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +csyrk.$(SUFFIX) : syrk.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zsyrk.$(SUFFIX) : syrk.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +ssyr2k.$(SUFFIX) : syr2k.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyr2k.$(SUFFIX) : syr2k.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +csyr2k.$(SUFFIX) : syr2k.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zsyr2k.$(SUFFIX) : syr2k.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +chemm.$(SUFFIX) : hemm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhemm.$(SUFFIX) : hemm.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cherk.$(SUFFIX) : herk.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zherk.$(SUFFIX) : herk.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cher2k.$(SUFFIX) : her2k.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zher2k.$(SUFFIX) : her2k.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cher.$(SUFFIX) : her.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zher.$(SUFFIX) : her.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cher2.$(SUFFIX) : her2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zher2.$(SUFFIX) : her2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sspmv.$(SUFFIX) : spmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dspmv.$(SUFFIX) : spmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +strmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +stpmv.$(SUFFIX) : tpmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtpmv.$(SUFFIX) : tpmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctpmv.$(SUFFIX) : tpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + 
+ztpmv.$(SUFFIX) : tpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +stpsv.$(SUFFIX) : tpsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtpsv.$(SUFFIX) : tpsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctpsv.$(SUFFIX) : tpsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztpsv.$(SUFFIX) : tpsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +strsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zger.$(SUFFIX) : ger.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +ssymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +csymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zsymv.$(SUFFIX) : symv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sgeev.$(SUFFIX) : geev.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgeev.$(SUFFIX) : geev.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgeev.$(SUFFIX) : geev.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgeev.$(SUFFIX) : geev.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sgetri.$(SUFFIX) : getri.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgetri.$(SUFFIX) : getri.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgetri.$(SUFFIX) : getri.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgetri.$(SUFFIX) : getri.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +spotrf.$(SUFFIX) : potrf.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dpotrf.$(SUFFIX) : potrf.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cpotrf.$(SUFFIX) : potrf.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zpotrf.$(SUFFIX) : potrf.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +chemv.$(SUFFIX) : hemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhemv.$(SUFFIX) : hemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +chbmv.$(SUFFIX) : hbmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhbmv.$(SUFFIX) : hbmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +chpmv.$(SUFFIX) : hpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhpmv.$(SUFFIX) : hpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sdot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +ddot.$(SUFFIX) : dot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cdot.$(SUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zdot.$(SUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cdot-intel.$(SUFFIX) : zdot-intel.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zdot-intel.$(SUFFIX) : zdot-intel.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + + +saxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +daxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +caxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + 
+zaxpy.$(SUFFIX) : axpy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +saxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +daxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +caxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zaxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +scopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dcopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ccopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zcopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zswap.$(SUFFIX) : swap.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + + +sscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zscal.$(SUFFIX) : scal.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +sasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +casum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +sgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgesv.$(SUFFIX) : gesv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +srot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +drot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +csrot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zdrot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +srotm.$(SUFFIX) : rotm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +drotm.$(SUFFIX) : rotm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + + +cgemm3m.$(SUFFIX) : gemm3m.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgemm3m.$(SUFFIX) : gemm3m.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +isamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +icamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +izamax.$(SUFFIX) : iamax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +ismax.$(SUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idmax.$(SUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + +isamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +idamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +icamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +izamin.$(SUFFIX) : iamin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +ismin.$(SUFFIX) : imin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o 
$(@F) $^ + +idmin.$(SUFFIX) : imin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + +samax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +damax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +camax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zamax.$(SUFFIX) : amax.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +smax.$(SUFFIX) : max.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dmax.$(SUFFIX) : max.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + +samin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +damin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +camin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zamin.$(SUFFIX) : amin.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +smin.$(SUFFIX) : min.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dmin.$(SUFFIX) : min.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + +snrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dnrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +scnrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +dznrm2.$(SUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + + +smallscaling: smallscaling.c ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread + +clean :: + @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling + +include $(TOPDIR)/Makefile.tail diff --git a/benchmark/amax.c b/benchmark/amax.c index 29310dd71..446ba4c07 100644 --- a/benchmark/amax.c +++ b/benchmark/amax.c @@ -1,133 +1,133 @@ -/*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "bench.h" - -#undef AMAX - -#ifdef COMPLEX -#ifdef DOUBLE -#define AMAX BLASFUNC(dzamax) -#else -#define AMAX BLASFUNC(scamax) -#endif -#else -#ifdef DOUBLE -#define AMAX BLASFUNC(damax) -#else -#define AMAX BLASFUNC(samax) -#endif -#endif - -int main(int argc, char *argv[]) -{ - - FLOAT *x; - blasint m, i; - blasint inc_x = 1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - double time1, timeg; - - argc--; - argv++; - - if (argc > 0) - { - from = atol(*argv); - argc--; - argv++; - } - if (argc > 0) - { - to = MAX(atol(*argv), from); - argc--; - argv++; - } - if (argc > 0) - { - step = atol(*argv); - argc--; - argv++; - } - - if ((p = getenv("OPENBLAS_LOOPS"))) - loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) - inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); - - if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) - { - fprintf(stderr, "Out of Memory!!\n"); - exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for (m = from; m <= to; m += step) - { - - timeg = 0; - fprintf(stderr, " %6d : ", (int)m); - - for (l = 0; l < loops; l++) - { - - for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) - { - x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; - } - - begin(); - AMAX(&m, x, &inc_x); - end(); - timeg += getsec(); - } - - timeg /= loops; - - fprintf(stderr, - " %10.2f MFlops %10.6f sec\n", - COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); - } - - return 0; -} - -// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "bench.h" + +#undef AMAX + +#ifdef COMPLEX +#ifdef DOUBLE +#define AMAX BLASFUNC(dzamax) +#else +#define AMAX BLASFUNC(scamax) +#endif +#else +#ifdef DOUBLE +#define AMAX BLASFUNC(damax) +#else +#define AMAX BLASFUNC(samax) +#endif +#endif + +int main(int argc, char *argv[]) +{ + + FLOAT *x; + blasint m, i; + blasint inc_x = 1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + double time1, timeg; + + argc--; + argv++; + + if (argc > 0) + { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) + { + to = MAX(atol(*argv), from); + argc--; + argv++; + } + if (argc > 0) + { + step = atol(*argv); + argc--; + argv++; + } + + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); + + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) + { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for (m = from; m <= to; m += step) + { + + timeg = 0; + fprintf(stderr, " %6d : ", (int)m); + + for (l = 0; l < loops; l++) + { + + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) + { + x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; + } + + begin(); + AMAX(&m, x, &inc_x); + end(); + timeg += getsec(); + } + + timeg /= loops; + + fprintf(stderr, + " %10.2f MFlops %10.6f sec\n", + COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/amin.c b/benchmark/amin.c index 54a1d266a..44f15a7f8 100644 --- a/benchmark/amin.c +++ b/benchmark/amin.c @@ -1,137 +1,137 @@ -/*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "bench.h" - -#undef AMIN - -#ifdef COMPLEX -#ifdef DOUBLE -#define AMIN BLASFUNC(dzamin) -#else -#define AMIN BLASFUNC(scamin) -#endif -#else -#ifdef DOUBLE -#define AMIN BLASFUNC(damin) -#else -#define AMIN BLASFUNC(samin) -#endif -#endif - -int main(int argc, char *argv[]) -{ - - FLOAT *x; - blasint m, i; - blasint inc_x = 1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - double time1, timeg; - - argc--; - argv++; - - if (argc > 0) - { - from = atol(*argv); - argc--; - argv++; - } - if (argc > 0) - { - to = MAX(atol(*argv), from); - argc--; - argv++; - } - if (argc > 0) - { - step = atol(*argv); - argc--; - argv++; - } - - if ((p = getenv("OPENBLAS_LOOPS"))) - loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) - inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); - - if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) - { - fprintf(stderr, "Out of Memory!!\n"); - exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for (m = from; m <= to; m += step) - { - - timeg = 0; - - fprintf(stderr, " %6d : ", (int)m); - - for (l = 0; l < loops; l++) - { - - for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) - { - x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; - } - - begin(); - - AMIN(&m, x, &inc_x); - - end(); - - timeg += getsec(); - } - - timeg /= loops; - - fprintf(stderr, - " %10.2f MFlops %10.6f sec\n", - COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); - } - - return 0; -} - -// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "bench.h" + +#undef AMIN + +#ifdef COMPLEX +#ifdef DOUBLE +#define AMIN BLASFUNC(dzamin) +#else +#define AMIN BLASFUNC(scamin) +#endif +#else +#ifdef DOUBLE +#define AMIN BLASFUNC(damin) +#else +#define AMIN BLASFUNC(samin) +#endif +#endif + +int main(int argc, char *argv[]) +{ + + FLOAT *x; + blasint m, i; + blasint inc_x = 1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + double time1, timeg; + + argc--; + argv++; + + if (argc > 0) + { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) + { + to = MAX(atol(*argv), from); + argc--; + argv++; + } + if (argc > 0) + { + step = atol(*argv); + argc--; + argv++; + } + + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); + + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) + { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for (m = from; m <= to; m += step) + { + + timeg = 0; + + fprintf(stderr, " %6d : ", (int)m); + + for (l = 0; l < loops; l++) + { + + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) + { + x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; + } + + begin(); + + AMIN(&m, x, &inc_x); + + end(); + + timeg += getsec(); + } + + timeg /= loops; + + fprintf(stderr, + " %10.2f MFlops %10.6f sec\n", + COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/hbmv.c b/benchmark/hbmv.c index 35249bdf9..7bf047abd 100644 --- a/benchmark/hbmv.c +++ b/benchmark/hbmv.c @@ -1,134 +1,134 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include "bench.h" - -#undef HBMV - -#ifdef DOUBLE -#define HBMV BLASFUNC(zhbmv) -#else -#define HBMV BLASFUNC(chbmv) -#endif - -int main(int argc, char *argv[]){ - - FLOAT *a, *x, *y; - FLOAT alpha[] = {1.0, 1.0}; - FLOAT beta [] = {0.0, 0.0}; - blasint k = 1; - char uplo='L'; - blasint m, i, j; - blasint inc_x=1, inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - double time1,timeg; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; - if ((p = getenv("OPENBLAS_K"))) k = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' k = %d Inc_x = %d Inc_y = %d Loops = %d\n", - from, to, step, uplo, k, inc_x, inc_y, loops); - - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { - fprintf(stderr,"Out of Memory!!\n"); - exit(1); - } - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { - fprintf(stderr,"Out of Memory!!\n"); - exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { - fprintf(stderr,"Out of Memory!!\n"); - exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) { - - timeg=0; - - fprintf(stderr, " %6dx%d : ", (int)m, (int)m); - - for(j = 0; j < m; j++) { - for(i = 0; i < m * COMPSIZE; i++) { - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - } - - for (l = 0; l < loops; l++) { - - for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { - x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - - for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { - y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - - begin(); - - HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - - end(); - - timeg += getsec(); - - } - - timeg /= loops; - - fprintf(stderr, " %10.2f MFlops\n", - COMPSIZE * COMPSIZE * 2. * (double)(2 * k + 1) * (double)m / timeg * 1.e-6); - } - - return 0; -} - -// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "bench.h" + +#undef HBMV + +#ifdef DOUBLE +#define HBMV BLASFUNC(zhbmv) +#else +#define HBMV BLASFUNC(chbmv) +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *x, *y; + FLOAT alpha[] = {1.0, 1.0}; + FLOAT beta [] = {0.0, 0.0}; + blasint k = 1; + char uplo='L'; + blasint m, i, j; + blasint inc_x=1, inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + if ((p = getenv("OPENBLAS_K"))) k = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' k = %d Inc_x = %d Inc_y = %d Loops = %d\n", + from, to, step, uplo, k, inc_x, inc_y, loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) { + + timeg=0; + + fprintf(stderr, " %6dx%d : ", (int)m, (int)m); + + for(j = 0; j < m; j++) { + for(i = 0; i < m * COMPSIZE; i++) { + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (l = 0; l < loops; l++) { + + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { + y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + begin(); + + HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); + + end(); + + timeg += getsec(); + + } + + timeg /= loops; + + fprintf(stderr, " %10.2f MFlops\n", + COMPSIZE * COMPSIZE * 2. * (double)(2 * k + 1) * (double)m / timeg * 1.e-6); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/hpmv.c b/benchmark/hpmv.c index 907e2adc4..0dc296ccc 100644 --- a/benchmark/hpmv.c +++ b/benchmark/hpmv.c @@ -1,133 +1,133 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. 
-Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "bench.h" - -#undef HPMV - -#ifdef DOUBLE -#define HPMV BLASFUNC(zhpmv) -#else -#define HPMV BLASFUNC(chpmv) -#endif - -int main(int argc, char *argv[]){ - - FLOAT *a, *x, *y; - FLOAT alpha[] = {1.0, 1.0}; - FLOAT beta [] = {1.0, 1.0}; - char uplo='L'; - blasint m, i, j; - blasint inc_x=1, inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - double time1,timeg; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; - - fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); - - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { - fprintf(stderr,"Out of Memory!!\n"); - exit(1); - } - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { - fprintf(stderr,"Out of Memory!!\n"); - exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { - fprintf(stderr,"Out of Memory!!\n"); - exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) { - - timeg=0; - - fprintf(stderr, " %6dx%d : ", (int)m, (int)m); - - for(j = 0; j < m; j++) { - for(i = 0; i < m * COMPSIZE; i++) { - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - } - - for (l = 0; l < loops; l++) { - - for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { - x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - - for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { - y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - - begin(); - - HPMV (&uplo, &m, alpha, a, x, &inc_x, 
beta, y, &inc_y ); - - end(); - - time1 = getsec(); - - timeg += time1; - - } - - timeg /= loops; - - fprintf(stderr, " %10.2f MFlops\n", - COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6); - } - - return 0; -} - -// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "bench.h" + +#undef HPMV + +#ifdef DOUBLE +#define HPMV BLASFUNC(zhpmv) +#else +#define HPMV BLASFUNC(chpmv) +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *x, *y; + FLOAT alpha[] = {1.0, 1.0}; + FLOAT beta [] = {1.0, 1.0}; + char uplo='L'; + blasint m, i, j; + blasint inc_x=1, inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n"); + exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) { + + timeg=0; + + fprintf(stderr, " %6dx%d : ", (int)m, (int)m); + + for(j = 0; j < m; j++) { + for(i = 0; i < m * COMPSIZE; i++) { + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (l = 0; l < loops; l++) { + + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { + y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + begin(); + + HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); + + end(); + + time1 = getsec(); + + timeg += time1; + + } + + timeg /= loops; + + fprintf(stderr, " %10.2f MFlops\n", + COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/iamin.c b/benchmark/iamin.c index a57638ecc..2384641a5 100644 --- a/benchmark/iamin.c +++ b/benchmark/iamin.c @@ -1,120 +1,120 @@ -/*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "bench.h" - -#undef IAMIN - -#ifdef COMPLEX -#ifdef DOUBLE -#define IAMIN BLASFUNC(izamin) -#else -#define IAMIN BLASFUNC(icamin) -#endif -#else -#ifdef DOUBLE -#define IAMIN BLASFUNC(idamin) -#else -#define IAMIN BLASFUNC(isamin) -#endif -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - double time1,timeg; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - - fprintf(stderr, " %6d : ", (int)m); - - - for (l=0; l 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - - fprintf(stderr, " %6d : ", (int)m); - - - for (l=0; l 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = 
MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - - fprintf(stderr, " %6d : ", (int)m); - - - for (l=0; l 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - - fprintf(stderr, " %6d : ", (int)m); - - - for (l=0; l 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l 0) { from = atol(*argv); argc--; argv++;} - if (argc > 
0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - - fprintf(stderr, " %6d : ", (int)m); - - - for (l=0; l 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l 0) { - from = atol(*argv); - argc--; - argv++; - } - if (argc > 0) { - to = MAX(atol(*argv), from); - argc--; - argv++; - } - if (argc > 0) { - step = atol(*argv); - argc--; - argv++; - } - - if ((p = getenv("OPENBLAS_LOOPS"))) - loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) - inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) - inc_y = atoi(p); - - fprintf( - stderr, - "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", - from, to, step, inc_x, inc_y, loops); - - if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == - NULL) { - fprintf(stderr, "Out of Memory!!\n"); - exit(1); - } - - if ((y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == - NULL) { - fprintf(stderr, "Out of Memory!!\n"); - exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for (m = from; m <= to; m += step) { - - timeg = 0; - - fprintf(stderr, " %6d : ", (int)m); - for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { - x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; - } - - for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { - y[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; - } - - for (l = 0; l < loops; l++) { - begin(); - - ROTM(&m, x, &inc_x, y, &inc_y, param); - - end(); - - time1 = getsec(); - - timeg += time1; - } - - timeg /= loops; - - fprintf(stderr, " %10.2f MFlops %10.6f sec\n", - COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg); - } - - return 0; -} +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "bench.h" + +#undef ROTM + +#ifdef DOUBLE +#define ROTM BLASFUNC(drotm) +#else +#define ROTM BLASFUNC(srotm) +#endif + +int main(int argc, char *argv[]) +{ + + FLOAT *x, *y; + // FLOAT result; + blasint m, i; + blasint inc_x = 1, inc_y = 1; + FLOAT param[5] = {1, 2.0, 3.0, 4.0, 5.0}; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + + double time1, timeg; + + argc--; + argv++; + + if (argc > 0) { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) { + to = MAX(atol(*argv), from); + argc--; + argv++; + } + if (argc > 0) { + step = atol(*argv); + argc--; + argv++; + } + + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) + inc_y = atoi(p); + + fprintf( + stderr, + "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", + from, to, step, inc_x, inc_y, loops); + + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == + NULL) { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); + } + + if ((y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == + NULL) { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for (m = from; m <= to; m += step) { + + timeg = 0; + + fprintf(stderr, " %6d : ", (int)m); + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; + } + + for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { + y[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; + } + + for (l = 0; l < loops; l++) { + begin(); + + ROTM(&m, x, &inc_x, y, &inc_y, param); + + end(); + + time1 = getsec(); + + timeg += time1; + } + + timeg /= loops; + + fprintf(stderr, " %10.2f MFlops %10.6f sec\n", + COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg); + } + + return 0; +} diff --git a/benchmark/spmv.c b/benchmark/spmv.c index e4dcbf4ae..1e62952ef 100644 --- a/benchmark/spmv.c +++ b/benchmark/spmv.c @@ -1,146 +1,146 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. 
Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "bench.h" - -#undef SPMV - -#ifndef COMPLEX - -#ifdef DOUBLE -#define SPMV BLASFUNC(dspmv) -#else -#define SPMV BLASFUNC(sspmv) -#endif - -#else - -#ifdef DOUBLE -#define SPMV BLASFUNC(zspmv) -#else -#define SPMV BLASFUNC(cspmv) -#endif - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *a, *x, *y; - FLOAT alpha[] = {1.0, 1.0}; - FLOAT beta [] = {1.0, 1.0}; - char uplo='L'; - blasint m, i, j; - blasint inc_x=1,inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - double time1,timeg; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; - - fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); - - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef __linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - - fprintf(stderr, " %6dx%d : ", (int)m,(int)m); - - for(j = 0; j < m; j++){ - for(i = 0; i < m * COMPSIZE; i++){ - a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - } - - - for (l=0; l 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' 
Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6dx%d : ", (int)m,(int)m); + + for(j = 0; j < m; j++){ + for(i = 0; i < m * COMPSIZE; i++){ + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + + for (l=0; l Date: Thu, 17 Nov 2022 18:06:17 +0100 Subject: [PATCH 103/154] Fix errors in LAPACKE ?tpmqrt for row major matrices (Reference-LAPACK PR540) --- .../LAPACKE/src/lapacke_ctpmqrt_work.c | 32 ++++++++++++------- .../LAPACKE/src/lapacke_dtpmqrt_work.c | 32 ++++++++++++------- .../LAPACKE/src/lapacke_stpmqrt_work.c | 32 ++++++++++++------- .../LAPACKE/src/lapacke_ztpmqrt_work.c | 32 ++++++++++++------- 4 files changed, 80 insertions(+), 48 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctpmqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_ctpmqrt_work.c index 5ec948e7b..e01664bdf 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ctpmqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ctpmqrt_work.c @@ -50,16 +50,24 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,k); + lapack_int nrowsA, ncolsA, nrowsV; + if ( side == LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } + else if ( side == LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } + else { + info = -2; + LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info ); + return info; + } + lapack_int lda_t = MAX(1,nrowsA); lapack_int ldb_t = MAX(1,m); - lapack_int ldt_t = MAX(1,ldt); - lapack_int ldv_t = MAX(1,ldv); + lapack_int ldt_t = MAX(1,nb); + lapack_int ldv_t = MAX(1,nrowsV); lapack_complex_float* v_t = NULL; lapack_complex_float* t_t = NULL; lapack_complex_float* a_t = NULL; lapack_complex_float* b_t = NULL; /* Check leading dimension(s) */ - if( lda < m ) { + if( lda < ncolsA ) { info = -14; LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info ); return info; @@ -69,7 +77,7 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info ); return info; } - if( ldt < nb ) { + if( ldt < k ) { info = -12; LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info ); return info; @@ -87,13 +95,13 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, goto exit_level_0; } t_t = (lapack_complex_float*) - LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,nb) ); + LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,k) ); if( t_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_1; } a_t = (lapack_complex_float*) - LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,m) ); + LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,ncolsA) ); if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_2; @@ -105,10 +113,10 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, goto exit_level_3; } /* Transpose input matrices 
*/ - LAPACKE_cge_trans( matrix_layout, ldv, k, v, ldv, v_t, ldv_t ); - LAPACKE_cge_trans( matrix_layout, ldt, nb, t, ldt, t_t, ldt_t ); - LAPACKE_cge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); - LAPACKE_cge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_cge_trans( LAPACK_ROW_MAJOR, nrowsV, k, v, ldv, v_t, ldv_t ); + LAPACKE_cge_trans( LAPACK_ROW_MAJOR, nb, k, t, ldt, t_t, ldt_t ); + LAPACKE_cge_trans( LAPACK_ROW_MAJOR, nrowsA, ncolsA, a, lda, a_t, lda_t ); + LAPACKE_cge_trans( LAPACK_ROW_MAJOR, m, n, b, ldb, b_t, ldb_t ); /* Call LAPACK function and adjust info */ LAPACK_ctpmqrt( &side, &trans, &m, &n, &k, &l, &nb, v_t, &ldv_t, t_t, &ldt_t, a_t, &lda_t, b_t, &ldb_t, work, &info ); @@ -116,7 +124,7 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } /* Transpose output matrices */ - LAPACKE_cge_trans( LAPACK_COL_MAJOR, k, m, a_t, lda_t, a, lda ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, nrowsA, ncolsA, a_t, lda_t, a, lda ); LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); /* Release memory and exit */ LAPACKE_free( b_t ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtpmqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_dtpmqrt_work.c index d9ee6226b..366acd369 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dtpmqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dtpmqrt_work.c @@ -48,16 +48,24 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,k); + lapack_int nrowsA, ncolsA, nrowsV; + if ( side == LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } + else if ( side == LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } + else { + info = -2; + LAPACKE_xerbla( "LAPACKE_dtpmqrt_work", info ); + return info; + } + lapack_int lda_t = MAX(1,nrowsA); lapack_int ldb_t = MAX(1,m); - lapack_int ldt_t = MAX(1,ldt); - lapack_int ldv_t = MAX(1,ldv); + lapack_int ldt_t = MAX(1,nb); + lapack_int ldv_t = MAX(1,nrowsV); double* v_t = NULL; double* t_t = NULL; double* a_t = NULL; double* b_t = NULL; /* Check leading dimension(s) */ - if( lda < m ) { + if( lda < ncolsA ) { info = -14; LAPACKE_xerbla( "LAPACKE_dtpmqrt_work", info ); return info; @@ -67,7 +75,7 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_dtpmqrt_work", info ); return info; } - if( ldt < nb ) { + if( ldt < k ) { info = -12; LAPACKE_xerbla( "LAPACKE_dtpmqrt_work", info ); return info; @@ -83,12 +91,12 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_0; } - t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,nb) ); + t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,k) ); if( t_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_1; } - a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,m) ); + a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,ncolsA) ); if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_2; @@ -99,10 +107,10 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, goto exit_level_3; } /* Transpose input matrices */ - LAPACKE_dge_trans( matrix_layout, ldv, k, v, ldv, v_t, ldv_t ); - LAPACKE_dge_trans( matrix_layout, ldt, nb, t, ldt, t_t, ldt_t ); - LAPACKE_dge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); - LAPACKE_dge_trans( matrix_layout, m, n, b, ldb, b_t, 
ldb_t ); + LAPACKE_dge_trans( LAPACK_ROW_MAJOR, nrowsV, k, v, ldv, v_t, ldv_t ); + LAPACKE_dge_trans( LAPACK_ROW_MAJOR, nb, k, t, ldt, t_t, ldt_t ); + LAPACKE_dge_trans( LAPACK_ROW_MAJOR, nrowsA, ncolsA, a, lda, a_t, lda_t ); + LAPACKE_dge_trans( LAPACK_ROW_MAJOR, m, n, b, ldb, b_t, ldb_t ); /* Call LAPACK function and adjust info */ LAPACK_dtpmqrt( &side, &trans, &m, &n, &k, &l, &nb, v_t, &ldv_t, t_t, &ldt_t, a_t, &lda_t, b_t, &ldb_t, work, &info ); @@ -110,7 +118,7 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } /* Transpose output matrices */ - LAPACKE_dge_trans( LAPACK_COL_MAJOR, k, m, a_t, lda_t, a, lda ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, nrowsA, ncolsA, a_t, lda_t, a, lda ); LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); /* Release memory and exit */ LAPACKE_free( b_t ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_stpmqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_stpmqrt_work.c index 095fbdcd9..c5a3a1496 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_stpmqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_stpmqrt_work.c @@ -48,16 +48,24 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,k); + lapack_int nrowsA, ncolsA, nrowsV; + if ( side == LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } + else if ( side == LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } + else { + info = -2; + LAPACKE_xerbla( "LAPACKE_stpmqrt_work", info ); + return info; + } + lapack_int lda_t = MAX(1,nrowsA); lapack_int ldb_t = MAX(1,m); - lapack_int ldt_t = MAX(1,ldt); - lapack_int ldv_t = MAX(1,ldv); + lapack_int ldt_t = MAX(1,nb); + lapack_int ldv_t = MAX(1,nrowsV); float* v_t = NULL; float* t_t = NULL; float* a_t = NULL; float* b_t = NULL; /* Check leading dimension(s) */ - if( lda < m ) { + if( lda < ncolsA ) { info = -14; LAPACKE_xerbla( "LAPACKE_stpmqrt_work", info ); return info; @@ -67,7 +75,7 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_stpmqrt_work", info ); return info; } - if( ldt < nb ) { + if( ldt < k ) { info = -12; LAPACKE_xerbla( "LAPACKE_stpmqrt_work", info ); return info; @@ -83,12 +91,12 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_0; } - t_t = (float*)LAPACKE_malloc( sizeof(float) * ldt_t * MAX(1,nb) ); + t_t = (float*)LAPACKE_malloc( sizeof(float) * ldt_t * MAX(1,k) ); if( t_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_1; } - a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,m) ); + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,ncolsA) ); if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_2; @@ -99,10 +107,10 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, goto exit_level_3; } /* Transpose input matrices */ - LAPACKE_sge_trans( matrix_layout, ldv, k, v, ldv, v_t, ldv_t ); - LAPACKE_sge_trans( matrix_layout, ldt, nb, t, ldt, t_t, ldt_t ); - LAPACKE_sge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); - LAPACKE_sge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_sge_trans( LAPACK_ROW_MAJOR, nrowsV, k, v, ldv, v_t, ldv_t ); + LAPACKE_sge_trans( LAPACK_ROW_MAJOR, nb, k, t, ldt, t_t, ldt_t ); + LAPACKE_sge_trans( LAPACK_ROW_MAJOR, nrowsA, ncolsA, a, lda, a_t, lda_t ); + LAPACKE_sge_trans( LAPACK_ROW_MAJOR, m, 
n, b, ldb, b_t, ldb_t ); /* Call LAPACK function and adjust info */ LAPACK_stpmqrt( &side, &trans, &m, &n, &k, &l, &nb, v_t, &ldv_t, t_t, &ldt_t, a_t, &lda_t, b_t, &ldb_t, work, &info ); @@ -110,7 +118,7 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } /* Transpose output matrices */ - LAPACKE_sge_trans( LAPACK_COL_MAJOR, k, m, a_t, lda_t, a, lda ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, nrowsA, ncolsA, a_t, lda_t, a, lda ); LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); /* Release memory and exit */ LAPACKE_free( b_t ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztpmqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_ztpmqrt_work.c index 643ae1d9d..104efa8f3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ztpmqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ztpmqrt_work.c @@ -50,16 +50,24 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,k); + lapack_int nrowsA, ncolsA, nrowsV; + if ( side == LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } + else if ( side == LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } + else { + info = -2; + LAPACKE_xerbla( "LAPACKE_ztpmqrt_work", info ); + return info; + } + lapack_int lda_t = MAX(1,nrowsA); lapack_int ldb_t = MAX(1,m); - lapack_int ldt_t = MAX(1,ldt); - lapack_int ldv_t = MAX(1,ldv); + lapack_int ldt_t = MAX(1,nb); + lapack_int ldv_t = MAX(1,nrowsV); lapack_complex_double* v_t = NULL; lapack_complex_double* t_t = NULL; lapack_complex_double* a_t = NULL; lapack_complex_double* b_t = NULL; /* Check leading dimension(s) */ - if( lda < m ) { + if( lda < ncolsA ) { info = -14; LAPACKE_xerbla( "LAPACKE_ztpmqrt_work", info ); return info; @@ -69,7 +77,7 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, LAPACKE_xerbla( "LAPACKE_ztpmqrt_work", info ); return info; } - if( ldt < nb ) { + if( ldt < k ) { info = -12; LAPACKE_xerbla( "LAPACKE_ztpmqrt_work", info ); return info; @@ -87,13 +95,13 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, goto exit_level_0; } t_t = (lapack_complex_double*) - LAPACKE_malloc( sizeof(lapack_complex_double) * ldt_t * MAX(1,nb) ); + LAPACKE_malloc( sizeof(lapack_complex_double) * ldt_t * MAX(1,k) ); if( t_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_1; } a_t = (lapack_complex_double*) - LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,m) ); + LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,ncolsA) ); if( a_t == NULL ) { info = LAPACK_TRANSPOSE_MEMORY_ERROR; goto exit_level_2; @@ -105,10 +113,10 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, goto exit_level_3; } /* Transpose input matrices */ - LAPACKE_zge_trans( matrix_layout, ldv, k, v, ldv, v_t, ldv_t ); - LAPACKE_zge_trans( matrix_layout, ldt, nb, t, ldt, t_t, ldt_t ); - LAPACKE_zge_trans( matrix_layout, k, m, a, lda, a_t, lda_t ); - LAPACKE_zge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t ); + LAPACKE_zge_trans( LAPACK_ROW_MAJOR, nrowsV, k, v, ldv, v_t, ldv_t ); + LAPACKE_zge_trans( LAPACK_ROW_MAJOR, nb, k, t, ldt, t_t, ldt_t ); + LAPACKE_zge_trans( LAPACK_ROW_MAJOR, nrowsA, ncolsA, a, lda, a_t, lda_t ); + LAPACKE_zge_trans( LAPACK_ROW_MAJOR, m, n, b, ldb, b_t, ldb_t ); /* Call LAPACK function and adjust info */ LAPACK_ztpmqrt( &side, &trans, &m, &n, &k, &l, &nb, v_t, &ldv_t, t_t, &ldt_t, a_t, &lda_t, b_t, &ldb_t, 
work, &info ); @@ -116,7 +124,7 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, info = info - 1; } /* Transpose output matrices */ - LAPACKE_zge_trans( LAPACK_COL_MAJOR, k, m, a_t, lda_t, a, lda ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, nrowsA, ncolsA, a_t, lda_t, a, lda ); LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb ); /* Release memory and exit */ LAPACKE_free( b_t ); From 6c9dbe5afa8ba39a93734ad27188cba4048f81a8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Nov 2022 18:09:49 +0100 Subject: [PATCH 104/154] Add a LAPACKE interface for ?LANGB (Reference-LAPACK PR725) --- lapack-netlib/LAPACKE/src/lapacke_clangb.c | 73 ++++++++++++++++ .../LAPACKE/src/lapacke_clangb_work.c | 84 +++++++++++++++++++ lapack-netlib/LAPACKE/src/lapacke_dlangb.c | 73 ++++++++++++++++ .../LAPACKE/src/lapacke_dlangb_work.c | 83 ++++++++++++++++++ lapack-netlib/LAPACKE/src/lapacke_slangb.c | 73 ++++++++++++++++ .../LAPACKE/src/lapacke_slangb_work.c | 83 ++++++++++++++++++ lapack-netlib/LAPACKE/src/lapacke_zlangb.c | 73 ++++++++++++++++ 7 files changed, 542 insertions(+) create mode 100644 lapack-netlib/LAPACKE/src/lapacke_clangb.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_clangb_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dlangb.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dlangb_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_slangb.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_slangb_work.c create mode 100644 lapack-netlib/LAPACKE/src/lapacke_zlangb.c diff --git a/lapack-netlib/LAPACKE/src/lapacke_clangb.c b/lapack-netlib/LAPACKE/src/lapacke_clangb.c new file mode 100644 index 000000000..0d61575aa --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_clangb.c @@ -0,0 +1,73 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function clangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +float LAPACKE_clangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_float* ab, lapack_int ldab ) +{ + lapack_int info = 0; + float res = 0.; + float* work = NULL; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_clangb", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cgb_nancheck( matrix_layout, n, n, kl, ku, ab, ldab ) ) { + return -6; + } + } +#endif + /* Allocate memory for working array(s) */ + if( LAPACKE_lsame( norm, 'i' ) ) { + work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call middle-level interface */ + res = LAPACKE_clangb_work( matrix_layout, norm, n, kl, ku, ab, ldab, work ); + /* Release memory and exit */ + if( LAPACKE_lsame( norm, 'i' ) ) { + LAPACKE_free( work ); + } +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_clangb", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_clangb_work.c b/lapack-netlib/LAPACKE/src/lapacke_clangb_work.c new file mode 100644 index 000000000..b5b2cf816 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_clangb_work.c @@ -0,0 +1,84 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function clangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +float LAPACKE_clangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_float* ab, lapack_int ldab, + float* work ) +{ + lapack_int info = 0; + float res = 0.; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + res = LAPACK_clangb( &norm, &n, &kl, &ku, ab, &ldab, work ); + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + char norm_lapack; + float* work_lapack = NULL; + /* Check leading dimension(s) */ + if( ldab < kl+ku+1 ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_clangb_work", info ); + return info; + } + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + /* Allocate memory for work array(s) */ + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); + if( work_lapack == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call LAPACK function */ + res = LAPACK_clangb( &norm, &n, &ku, &kl, ab, &ldab, work ); + /* Release memory and exit */ + if( work_lapack ) { + LAPACKE_free( work_lapack ); + } +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_clangb_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_clangb_work", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlangb.c b/lapack-netlib/LAPACKE/src/lapacke_dlangb.c new file mode 100644 index 000000000..ca16ea7f4 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dlangb.c @@ -0,0 +1,73 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function dlangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +double LAPACKE_dlangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const double* ab, + lapack_int ldab ) +{ + lapack_int info = 0; + double res = 0.; + double* work = NULL; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dlangb", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dgb_nancheck( matrix_layout, n, n, kl, ku, ab, ldab ) ) { + return -6; + } + } +#endif + /* Allocate memory for working array(s) */ + if( LAPACKE_lsame( norm, 'i' ) ) { + work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call middle-level interface */ + res = LAPACKE_dlangb_work( matrix_layout, norm, n, kl, ku, ab, ldab, work ); + /* Release memory and exit */ + if( LAPACKE_lsame( norm, 'i' ) ) { + LAPACKE_free( work ); + } +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dlangb", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlangb_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlangb_work.c new file mode 100644 index 000000000..ba04c2b62 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dlangb_work.c @@ -0,0 +1,83 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function dlangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +double LAPACKE_dlangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const double* ab, + lapack_int ldab, double* work ) +{ + lapack_int info = 0; + double res = 0.; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + res = LAPACK_dlangb( &norm, &n, &kl, &ku, ab, &ldab, work ); + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + char norm_lapack; + double* work_lapack = NULL; + /* Check leading dimension(s) */ + if( ldab < kl+ku+1 ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_dlangb_work", info ); + return info; + } + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + /* Allocate memory for work array(s) */ + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); + if( work_lapack == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call LAPACK function */ + res = LAPACK_dlangb( &norm, &n, &ku, &kl, ab, &ldab, work ); + /* Release memory and exit */ + if( work_lapack ) { + LAPACKE_free( work_lapack ); + } +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dlangb_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_dlangb_work", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_slangb.c b/lapack-netlib/LAPACKE/src/lapacke_slangb.c new file mode 100644 index 000000000..9ba3f30d8 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_slangb.c @@ -0,0 +1,73 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function slangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +float LAPACKE_slangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const float* ab, + lapack_int ldab ) +{ + lapack_int info = 0; + float res = 0.; + float* work = NULL; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_slangb", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sgb_nancheck( matrix_layout, n, n, kl, ku, ab, ldab ) ) { + return -6; + } + } +#endif + /* Allocate memory for working array(s) */ + if( LAPACKE_lsame( norm, 'i' ) ) { + work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call middle-level interface */ + res = LAPACKE_slangb_work( matrix_layout, norm, n, kl, ku, ab, ldab, work ); + /* Release memory and exit */ + if( LAPACKE_lsame( norm, 'i' ) ) { + LAPACKE_free( work ); + } +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_slangb", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_slangb_work.c b/lapack-netlib/LAPACKE/src/lapacke_slangb_work.c new file mode 100644 index 000000000..7ef86e9d9 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_slangb_work.c @@ -0,0 +1,83 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function slangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +float LAPACKE_slangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, const float* ab, + lapack_int ldab, float* work ) +{ + lapack_int info = 0; + float res = 0.; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + res = LAPACK_slangb( &norm, &n, &kl, &ku, ab, &ldab, work ); + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + char norm_lapack; + float* work_lapack = NULL; + /* Check leading dimension(s) */ + if( ldab < kl+ku+1 ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_slangb_work", info ); + return info; + } + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + /* Allocate memory for work array(s) */ + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); + if( work_lapack == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call LAPACK function */ + res = LAPACK_slangb( &norm, &n, &ku, &kl, ab, &ldab, work ); + /* Release memory and exit */ + if( work_lapack ) { + LAPACKE_free( work_lapack ); + } +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_slangb_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_slangb_work", info ); + } + return res; +} diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlangb.c b/lapack-netlib/LAPACKE/src/lapacke_zlangb.c new file mode 100644 index 000000000..3a22ad982 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zlangb.c @@ -0,0 +1,73 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function zlangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +double LAPACKE_zlangb( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_double* ab, lapack_int ldab ) +{ + lapack_int info = 0; + double res = 0.; + double* work = NULL; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_zlangb", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zgb_nancheck( matrix_layout, n, n, kl, ku, ab, ldab ) ) { + return -6; + } + } +#endif + /* Allocate memory for working array(s) */ + if( LAPACKE_lsame( norm, 'i' ) ) { + work = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call middle-level interface */ + res = LAPACKE_zlangb_work( matrix_layout, norm, n, kl, ku, ab, ldab, work ); + /* Release memory and exit */ + if( LAPACKE_lsame( norm, 'i' ) ) { + LAPACKE_free( work ); + } +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zlangb", info ); + } + return res; +} From e4a31c0d23edbbc518c940b88623e6067fe9d0a2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Nov 2022 18:15:04 +0100 Subject: [PATCH 105/154] add ?LANGB interface (Reference-LAPACK PR725) --- cmake/lapacke.cmake | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index c740eceb4..3a9352197 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -318,6 +318,8 @@ set(CSRC lapacke_clacn2.c lapacke_clag2z.c lapacke_clag2z_work.c + lapacke_clangb.c + lapacke_clangb_work.c lapacke_clange.c lapacke_clange_work.c lapacke_clanhe.c @@ -803,6 +805,8 @@ set(DSRC lapacke_dlag2s_work.c lapacke_dlamch.c lapacke_dlamch_work.c + lapacke_dlangb.c + lapacke_dlangb_work.c lapacke_dlange.c lapacke_dlange_work.c lapacke_dlansy.c @@ -1381,6 +1385,8 @@ set(SSRC lapacke_slag2d_work.c lapacke_slamch.c lapacke_slamch_work.c + lapacke_slangb.c + lapacke_slangb_work.c lapacke_slange.c lapacke_slange_work.c lapacke_slansy.c @@ -2089,6 +2095,8 @@ set(ZSRC lapacke_zlacrm_work.c lapacke_zlag2c.c lapacke_zlag2c_work.c + lapacke_zlangb.c + lapacke_zlangb_work.c lapacke_zlange.c lapacke_zlange_work.c lapacke_zlanhe.c From 48c9c6efb92ee9abb2c835cca1167bb2673d2dfd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Nov 2022 18:19:14 +0100 Subject: [PATCH 106/154] Add ?LANGB interface (Reference-LAPACK PR725) --- lapack-netlib/LAPACKE/src/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile index 7f827e1c9..9c02c1445 100644 --- a/lapack-netlib/LAPACKE/src/Makefile +++ b/lapack-netlib/LAPACKE/src/Makefile @@ -358,6 +358,8 @@ lapacke_clacrm.o \ lapacke_clacrm_work.o \ lapacke_clag2z.o \ lapacke_clag2z_work.o \ +lapacke_clangb.o \ +lapacke_clangb_work.o \ lapacke_clange.o \ lapacke_clange_work.o \ lapacke_clanhe.o \ @@ -842,6 +844,8 @@ lapacke_dlag2s.o \ lapacke_dlag2s_work.o \ lapacke_dlamch.o \ lapacke_dlamch_work.o \ +lapacke_dlangb.o \ +lapacke_dlangb_work.o \ lapacke_dlange.o \ lapacke_dlange_work.o \ lapacke_dlansy.o \ @@ -1414,6 +1418,8 @@ lapacke_slacpy.o \ 
lapacke_slacpy_work.o \ lapacke_slamch.o \ lapacke_slamch_work.o \ +lapacke_slangb.o \ +lapacke_slangb_work.o \ lapacke_slange.o \ lapacke_slange_work.o \ lapacke_slansy.o \ @@ -2116,6 +2122,8 @@ lapacke_zlacrm.o \ lapacke_zlacrm_work.o \ lapacke_zlag2c.o \ lapacke_zlag2c_work.o \ +lapacke_zlangb.o \ +lapacke_zlangb_work.o \ lapacke_zlange.o \ lapacke_zlange_work.o \ lapacke_zlanhe.o \ From 9fe75af5280a0cb658f89c0fad3d6edb5f7b421a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Nov 2022 19:46:53 +0100 Subject: [PATCH 107/154] Add a LAPACKE interface for ?LANGB (Reference-LAPACK PR725) --- .../LAPACKE/src/lapacke_zlangb_work.c | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 lapack-netlib/LAPACKE/src/lapacke_zlangb_work.c diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlangb_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlangb_work.c new file mode 100644 index 000000000..d64fb482d --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zlangb_work.c @@ -0,0 +1,84 @@ +/***************************************************************************** + Copyright (c) 2022, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function zlangb +* Author: Simon Märtens +*****************************************************************************/ + +#include "lapacke_utils.h" + +double LAPACKE_zlangb_work( int matrix_layout, char norm, lapack_int n, + lapack_int kl, lapack_int ku, + const lapack_complex_double* ab, lapack_int ldab, + double* work ) +{ + lapack_int info = 0; + double res = 0.; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + res = LAPACK_zlangb( &norm, &n, &kl, &ku, ab, &ldab, work ); + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + char norm_lapack; + double* work_lapack = NULL; + /* Check leading dimension(s) */ + if( ldab < kl+ku+1 ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_zlangb_work", info ); + return info; + } + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + /* Allocate memory for work array(s) */ + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); + if( work_lapack == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + } + /* Call LAPACK function */ + res = LAPACK_zlangb( &norm, &n, &ku, &kl, ab, &ldab, work ); + /* Release memory and exit */ + if( work_lapack ) { + LAPACKE_free( work_lapack ); + } +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zlangb_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_zlangb_work", info ); + } + return res; +} From 35295912a3f1b83ba8fd22f1fe2fccce6ff4a201 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Nov 2022 14:57:54 +0100 Subject: [PATCH 108/154] Define type conversions explicitly (Reference-LAPACK PR 703) --- lapack-netlib/SRC/cgebak.f | 4 ++-- lapack-netlib/SRC/cgees.f | 2 +- lapack-netlib/SRC/cgeesx.f | 2 +- lapack-netlib/SRC/cgejsv.f | 36 +++++++++++++++++----------------- lapack-netlib/SRC/cggbak.f | 8 ++++---- lapack-netlib/SRC/cggbal.f | 4 ++-- lapack-netlib/SRC/cggglm.f | 2 +- lapack-netlib/SRC/cgghd3.f | 2 +- lapack-netlib/SRC/cgglse.f | 2 +- lapack-netlib/SRC/cggqrf.f | 2 +- lapack-netlib/SRC/chegvd.f | 6 +++--- lapack-netlib/SRC/chesv_rk.f | 2 +- lapack-netlib/SRC/chpgvd.f | 6 +++--- lapack-netlib/SRC/csysv.f | 2 +- lapack-netlib/SRC/csysv_rk.f | 2 +- lapack-netlib/SRC/csysv_rook.f | 2 +- lapack-netlib/SRC/cungbr.f | 2 +- 17 files changed, 43 insertions(+), 43 deletions(-) diff --git a/lapack-netlib/SRC/cgebak.f b/lapack-netlib/SRC/cgebak.f index 201dbfcec..4348d5ea4 100644 --- a/lapack-netlib/SRC/cgebak.f +++ b/lapack-netlib/SRC/cgebak.f @@ -238,7 +238,7 @@ $ GO TO 40 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 40 CALL CSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -252,7 +252,7 @@ $ GO TO 50 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 50 CALL CSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) diff --git a/lapack-netlib/SRC/cgees.f b/lapack-netlib/SRC/cgees.f index 359fa2afe..71acfdba3 100644 --- a/lapack-netlib/SRC/cgees.f +++ b/lapack-netlib/SRC/cgees.f @@ -282,7 +282,7 @@ * CALL CHSEQR( 'S', JOBVS, N, 1, N, A, LDA, W, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = REAL( WORK( 1 ) ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, HSWORK ) diff --git 
a/lapack-netlib/SRC/cgeesx.f b/lapack-netlib/SRC/cgeesx.f index 1113563ba..782e36747 100644 --- a/lapack-netlib/SRC/cgeesx.f +++ b/lapack-netlib/SRC/cgeesx.f @@ -337,7 +337,7 @@ * CALL CHSEQR( 'S', JOBVS, N, 1, N, A, LDA, W, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = REAL( WORK( 1 ) ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, HSWORK ) diff --git a/lapack-netlib/SRC/cgejsv.f b/lapack-netlib/SRC/cgejsv.f index 25ab81302..e37b25b6b 100644 --- a/lapack-netlib/SRC/cgejsv.f +++ b/lapack-netlib/SRC/cgejsv.f @@ -704,11 +704,11 @@ IF ( LQUERY ) THEN CALL CGEQP3( M, N, A, LDA, IWORK, CDUMMY, CDUMMY, -1, $ RDUMMY, IERR ) - LWRK_CGEQP3 = REAL( CDUMMY(1) ) + LWRK_CGEQP3 = INT( CDUMMY(1) ) CALL CGEQRF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) - LWRK_CGEQRF = REAL( CDUMMY(1) ) + LWRK_CGEQRF = INT( CDUMMY(1) ) CALL CGELQF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) - LWRK_CGELQF = REAL( CDUMMY(1) ) + LWRK_CGELQF = INT( CDUMMY(1) ) END IF MINWRK = 2 OPTWRK = 2 @@ -724,7 +724,7 @@ IF ( LQUERY ) THEN CALL CGESVJ( 'L', 'N', 'N', N, N, A, LDA, SVA, N, V, $ LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJ = REAL( CDUMMY(1) ) + LWRK_CGESVJ = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_CGEQP3, N**2+LWCON, $ N+LWRK_CGEQRF, LWRK_CGESVJ ) @@ -760,10 +760,10 @@ IF ( LQUERY ) THEN CALL CGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJ = REAL( CDUMMY(1) ) + LWRK_CGESVJ = INT( CDUMMY(1) ) CALL CUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, $ V, LDV, CDUMMY, -1, IERR ) - LWRK_CUNMLQ = REAL( CDUMMY(1) ) + LWRK_CUNMLQ = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_CGEQP3, LWCON, LWRK_CGESVJ, $ N+LWRK_CGELQF, 2*N+LWRK_CGEQRF, @@ -799,10 +799,10 @@ IF ( LQUERY ) THEN CALL CGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJ = REAL( CDUMMY(1) ) + LWRK_CGESVJ = INT( CDUMMY(1) ) CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_CUNMQRM = REAL( CDUMMY(1) ) + LWRK_CUNMQRM = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = N + MAX( LWRK_CGEQP3, LWCON, N+LWRK_CGEQRF, $ LWRK_CGESVJ, LWRK_CUNMQRM ) @@ -861,26 +861,26 @@ IF ( LQUERY ) THEN CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_CUNMQRM = REAL( CDUMMY(1) ) + LWRK_CUNMQRM = INT( CDUMMY(1) ) CALL CUNMQR( 'L', 'N', N, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_CUNMQR = REAL( CDUMMY(1) ) + LWRK_CUNMQR = INT( CDUMMY(1) ) IF ( .NOT. 
JRACC ) THEN CALL CGEQP3( N,N, A, LDA, IWORK, CDUMMY,CDUMMY, -1, $ RDUMMY, IERR ) - LWRK_CGEQP3N = REAL( CDUMMY(1) ) + LWRK_CGEQP3N = INT( CDUMMY(1) ) CALL CGESVJ( 'L', 'U', 'N', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJ = REAL( CDUMMY(1) ) + LWRK_CGESVJ = INT( CDUMMY(1) ) CALL CGESVJ( 'U', 'U', 'N', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJU = REAL( CDUMMY(1) ) + LWRK_CGESVJU = INT( CDUMMY(1) ) CALL CGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJV = REAL( CDUMMY(1) ) + LWRK_CGESVJV = INT( CDUMMY(1) ) CALL CUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, $ V, LDV, CDUMMY, -1, IERR ) - LWRK_CUNMLQ = REAL( CDUMMY(1) ) + LWRK_CUNMLQ = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_CGEQP3, N+LWCON, $ 2*N+N**2+LWCON, 2*N+LWRK_CGEQRF, @@ -909,13 +909,13 @@ ELSE CALL CGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_CGESVJV = REAL( CDUMMY(1) ) + LWRK_CGESVJV = INT( CDUMMY(1) ) CALL CUNMQR( 'L', 'N', N, N, N, CDUMMY, N, CDUMMY, $ V, LDV, CDUMMY, -1, IERR ) - LWRK_CUNMQR = REAL( CDUMMY(1) ) + LWRK_CUNMQR = INT( CDUMMY(1) ) CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_CUNMQRM = REAL( CDUMMY(1) ) + LWRK_CUNMQRM = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_CGEQP3, N+LWCON, $ 2*N+LWRK_CGEQRF, 2*N+N**2, diff --git a/lapack-netlib/SRC/cggbak.f b/lapack-netlib/SRC/cggbak.f index e8ac34805..159449601 100644 --- a/lapack-netlib/SRC/cggbak.f +++ b/lapack-netlib/SRC/cggbak.f @@ -253,7 +253,7 @@ IF( ILO.EQ.1 ) $ GO TO 50 DO 40 I = ILO - 1, 1, -1 - K = RSCALE( I ) + K = INT( RSCALE( I ) ) IF( K.EQ.I ) $ GO TO 40 CALL CSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -263,7 +263,7 @@ IF( IHI.EQ.N ) $ GO TO 70 DO 60 I = IHI + 1, N - K = RSCALE( I ) + K = INT( RSCALE( I ) ) IF( K.EQ.I ) $ GO TO 60 CALL CSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -277,7 +277,7 @@ IF( ILO.EQ.1 ) $ GO TO 90 DO 80 I = ILO - 1, 1, -1 - K = LSCALE( I ) + K = INT( LSCALE( I ) ) IF( K.EQ.I ) $ GO TO 80 CALL CSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -287,7 +287,7 @@ IF( IHI.EQ.N ) $ GO TO 110 DO 100 I = IHI + 1, N - K = LSCALE( I ) + K = INT( LSCALE( I ) ) IF( K.EQ.I ) $ GO TO 100 CALL CSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) diff --git a/lapack-netlib/SRC/cggbal.f b/lapack-netlib/SRC/cggbal.f index c7a232415..66ba7a881 100644 --- a/lapack-netlib/SRC/cggbal.f +++ b/lapack-netlib/SRC/cggbal.f @@ -535,7 +535,7 @@ IRAB = ICAMAX( N-ILO+1, B( I, ILO ), LDB ) RAB = MAX( RAB, ABS( B( I, IRAB+ILO-1 ) ) ) LRAB = INT( LOG10( RAB+SFMIN ) / BASL+ONE ) - IR = LSCALE( I ) + SIGN( HALF, LSCALE( I ) ) + IR = INT( LSCALE( I ) + SIGN( HALF, LSCALE( I ) ) ) IR = MIN( MAX( IR, LSFMIN ), LSFMAX, LSFMAX-LRAB ) LSCALE( I ) = SCLFAC**IR ICAB = ICAMAX( IHI, A( 1, I ), 1 ) @@ -543,7 +543,7 @@ ICAB = ICAMAX( IHI, B( 1, I ), 1 ) CAB = MAX( CAB, ABS( B( ICAB, I ) ) ) LCAB = INT( LOG10( CAB+SFMIN ) / BASL+ONE ) - JC = RSCALE( I ) + SIGN( HALF, RSCALE( I ) ) + JC = INT( RSCALE( I ) + SIGN( HALF, RSCALE( I ) ) ) JC = MIN( MAX( JC, LSFMIN ), LSFMAX, LSFMAX-LCAB ) RSCALE( I ) = SCLFAC**JC 360 CONTINUE diff --git a/lapack-netlib/SRC/cggglm.f b/lapack-netlib/SRC/cggglm.f index 3efca1e71..fb384b651 100644 --- a/lapack-netlib/SRC/cggglm.f +++ b/lapack-netlib/SRC/cggglm.f @@ -289,7 +289,7 @@ * CALL CGGQRF( N, M, P, A, LDA, WORK, B, LDB, WORK( M+1 ), $ WORK( M+NP+1 ), LWORK-M-NP, INFO ) - LOPT = REAL( WORK( M+NP+1 ) ) + LOPT = INT( WORK( M+NP+1 ) ) * * 
Update left-hand-side vector d = Q**H*d = ( d1 ) M * ( d2 ) N-M diff --git a/lapack-netlib/SRC/cgghd3.f b/lapack-netlib/SRC/cgghd3.f index 76d7de4ce..1074b4828 100644 --- a/lapack-netlib/SRC/cgghd3.f +++ b/lapack-netlib/SRC/cgghd3.f @@ -511,7 +511,7 @@ * IF( JJ.GT.0 ) THEN DO I = JJ, 1, -1 - C = DBLE( A( J+1+I, J ) ) + C = REAL( A( J+1+I, J ) ) CALL CROT( IHI-TOP, A( TOP+1, J+I+1 ), 1, $ A( TOP+1, J+I ), 1, C, $ -CONJG( B( J+1+I, J ) ) ) diff --git a/lapack-netlib/SRC/cgglse.f b/lapack-netlib/SRC/cgglse.f index 4785941db..cca20dfed 100644 --- a/lapack-netlib/SRC/cgglse.f +++ b/lapack-netlib/SRC/cgglse.f @@ -276,7 +276,7 @@ * CALL CGGRQF( P, M, N, B, LDB, WORK, A, LDA, WORK( P+1 ), $ WORK( P+MN+1 ), LWORK-P-MN, INFO ) - LOPT = REAL( WORK( P+MN+1 ) ) + LOPT = INT( WORK( P+MN+1 ) ) * * Update c = Z**H *c = ( c1 ) N-P * ( c2 ) M+P-N diff --git a/lapack-netlib/SRC/cggqrf.f b/lapack-netlib/SRC/cggqrf.f index febd9be8d..0185f4e0d 100644 --- a/lapack-netlib/SRC/cggqrf.f +++ b/lapack-netlib/SRC/cggqrf.f @@ -276,7 +276,7 @@ * QR factorization of N-by-M matrix A: A = Q*R * CALL CGEQRF( N, M, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = REAL( WORK( 1 ) ) + LOPT = INT( WORK( 1 ) ) * * Update B := Q**H*B. * diff --git a/lapack-netlib/SRC/chegvd.f b/lapack-netlib/SRC/chegvd.f index 0c708190c..4b7f43d52 100644 --- a/lapack-netlib/SRC/chegvd.f +++ b/lapack-netlib/SRC/chegvd.f @@ -360,9 +360,9 @@ CALL CHEGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO ) CALL CHEEVD( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, RWORK, LRWORK, $ IWORK, LIWORK, INFO ) - LOPT = MAX( REAL( LOPT ), REAL( WORK( 1 ) ) ) - LROPT = MAX( REAL( LROPT ), REAL( RWORK( 1 ) ) ) - LIOPT = MAX( REAL( LIOPT ), REAL( IWORK( 1 ) ) ) + LOPT = INT( MAX( REAL( LOPT ), REAL( WORK( 1 ) ) ) ) + LROPT = INT( MAX( REAL( LROPT ), REAL( RWORK( 1 ) ) ) ) + LIOPT = INT( MAX( REAL( LIOPT ), REAL( IWORK( 1 ) ) ) ) * IF( WANTZ .AND. 
INFO.EQ.0 ) THEN * diff --git a/lapack-netlib/SRC/chesv_rk.f b/lapack-netlib/SRC/chesv_rk.f index a659c8e79..e123fa299 100644 --- a/lapack-netlib/SRC/chesv_rk.f +++ b/lapack-netlib/SRC/chesv_rk.f @@ -280,7 +280,7 @@ LWKOPT = 1 ELSE CALL CHETRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) - LWKOPT = REAL( WORK(1) ) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/chpgvd.f b/lapack-netlib/SRC/chpgvd.f index 754be31ed..65d08b783 100644 --- a/lapack-netlib/SRC/chpgvd.f +++ b/lapack-netlib/SRC/chpgvd.f @@ -335,9 +335,9 @@ CALL CHPGST( ITYPE, UPLO, N, AP, BP, INFO ) CALL CHPEVD( JOBZ, UPLO, N, AP, W, Z, LDZ, WORK, LWORK, RWORK, $ LRWORK, IWORK, LIWORK, INFO ) - LWMIN = MAX( REAL( LWMIN ), REAL( WORK( 1 ) ) ) - LRWMIN = MAX( REAL( LRWMIN ), REAL( RWORK( 1 ) ) ) - LIWMIN = MAX( REAL( LIWMIN ), REAL( IWORK( 1 ) ) ) + LWMIN = INT( MAX( REAL( LWMIN ), REAL( WORK( 1 ) ) ) ) + LRWMIN = INT( MAX( REAL( LRWMIN ), REAL( RWORK( 1 ) ) ) ) + LIWMIN = INT( MAX( REAL( LIWMIN ), REAL( IWORK( 1 ) ) ) ) * IF( WANTZ ) THEN * diff --git a/lapack-netlib/SRC/csysv.f b/lapack-netlib/SRC/csysv.f index 6f175e381..4ddabf62f 100644 --- a/lapack-netlib/SRC/csysv.f +++ b/lapack-netlib/SRC/csysv.f @@ -223,7 +223,7 @@ LWKOPT = 1 ELSE CALL CSYTRF( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = REAL( WORK(1) ) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/csysv_rk.f b/lapack-netlib/SRC/csysv_rk.f index 793e39df5..ef5334dcd 100644 --- a/lapack-netlib/SRC/csysv_rk.f +++ b/lapack-netlib/SRC/csysv_rk.f @@ -280,7 +280,7 @@ LWKOPT = 1 ELSE CALL CSYTRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) - LWKOPT = REAL( WORK(1) ) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/csysv_rook.f b/lapack-netlib/SRC/csysv_rook.f index daa9f27c4..aad594e21 100644 --- a/lapack-netlib/SRC/csysv_rook.f +++ b/lapack-netlib/SRC/csysv_rook.f @@ -256,7 +256,7 @@ LWKOPT = 1 ELSE CALL CSYTRF_ROOK( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = REAL( WORK(1) ) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/cungbr.f b/lapack-netlib/SRC/cungbr.f index c973d0b0a..a31a53d79 100644 --- a/lapack-netlib/SRC/cungbr.f +++ b/lapack-netlib/SRC/cungbr.f @@ -233,7 +233,7 @@ END IF END IF END IF - LWKOPT = REAL( WORK( 1 ) ) + LWKOPT = INT( WORK( 1 ) ) LWKOPT = MAX (LWKOPT, MN) END IF * From 08bc43c73d43ab0f20595b705c1b07a2ddabf41e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Nov 2022 15:04:30 +0100 Subject: [PATCH 109/154] Define type conversions explicitly (Reference-LAPACK PR 703) --- lapack-netlib/SRC/dgebak.f | 4 ++-- lapack-netlib/SRC/dgees.f | 2 +- lapack-netlib/SRC/dgeesx.f | 2 +- lapack-netlib/SRC/dgelss.f | 26 +++++++++++++------------- lapack-netlib/SRC/dggglm.f | 2 +- lapack-netlib/SRC/dgglse.f | 2 +- lapack-netlib/SRC/dggqrf.f | 2 +- lapack-netlib/SRC/dggrqf.f | 2 +- lapack-netlib/SRC/dlag2s.f | 9 ++++++--- lapack-netlib/SRC/dlat2s.f | 7 +++++-- lapack-netlib/SRC/dorgbr.f | 2 +- lapack-netlib/SRC/dspgvd.f | 4 ++-- lapack-netlib/SRC/dsygvd.f | 4 ++-- lapack-netlib/SRC/dsysv.f | 2 +- lapack-netlib/SRC/dsysv_rk.f | 2 +- lapack-netlib/SRC/dsysv_rook.f | 2 +- 16 files changed, 40 insertions(+), 34 deletions(-) diff --git a/lapack-netlib/SRC/dgebak.f b/lapack-netlib/SRC/dgebak.f index e978d7af2..9c086794a 100644 --- a/lapack-netlib/SRC/dgebak.f +++ b/lapack-netlib/SRC/dgebak.f @@ -236,7 +236,7 @@ $ GO TO 40 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( 
SCALE( I ) ) IF( K.EQ.I ) $ GO TO 40 CALL DSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -250,7 +250,7 @@ $ GO TO 50 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 50 CALL DSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) diff --git a/lapack-netlib/SRC/dgees.f b/lapack-netlib/SRC/dgees.f index 82b9d6ee4..24739b1cf 100644 --- a/lapack-netlib/SRC/dgees.f +++ b/lapack-netlib/SRC/dgees.f @@ -302,7 +302,7 @@ * CALL DHSEQR( 'S', JOBVS, N, 1, N, A, LDA, WR, WI, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = WORK( 1 ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, N + HSWORK ) diff --git a/lapack-netlib/SRC/dgeesx.f b/lapack-netlib/SRC/dgeesx.f index 08fbb6468..f3677fcb3 100644 --- a/lapack-netlib/SRC/dgeesx.f +++ b/lapack-netlib/SRC/dgeesx.f @@ -382,7 +382,7 @@ * CALL DHSEQR( 'S', JOBVS, N, 1, N, A, LDA, WR, WI, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = WORK( 1 ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, N + HSWORK ) diff --git a/lapack-netlib/SRC/dgelss.f b/lapack-netlib/SRC/dgelss.f index 8ed703fcf..c4190f2e0 100644 --- a/lapack-netlib/SRC/dgelss.f +++ b/lapack-netlib/SRC/dgelss.f @@ -254,11 +254,11 @@ * * Compute space needed for DGEQRF CALL DGEQRF( M, N, A, LDA, DUM(1), DUM(1), -1, INFO ) - LWORK_DGEQRF=DUM(1) + LWORK_DGEQRF = INT( DUM(1) ) * Compute space needed for DORMQR CALL DORMQR( 'L', 'T', M, NRHS, N, A, LDA, DUM(1), B, $ LDB, DUM(1), -1, INFO ) - LWORK_DORMQR=DUM(1) + LWORK_DORMQR = INT( DUM(1) ) MM = N MAXWRK = MAX( MAXWRK, N + LWORK_DGEQRF ) MAXWRK = MAX( MAXWRK, N + LWORK_DORMQR ) @@ -273,15 +273,15 @@ * Compute space needed for DGEBRD CALL DGEBRD( MM, N, A, LDA, S, DUM(1), DUM(1), $ DUM(1), DUM(1), -1, INFO ) - LWORK_DGEBRD=DUM(1) + LWORK_DGEBRD = INT( DUM(1) ) * Compute space needed for DORMBR CALL DORMBR( 'Q', 'L', 'T', MM, NRHS, N, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_DORMBR=DUM(1) + LWORK_DORMBR = INT( DUM(1) ) * Compute space needed for DORGBR CALL DORGBR( 'P', N, N, N, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_DORGBR=DUM(1) + LWORK_DORGBR = INT( DUM(1) ) * Compute total workspace needed MAXWRK = MAX( MAXWRK, 3*N + LWORK_DGEBRD ) MAXWRK = MAX( MAXWRK, 3*N + LWORK_DORMBR ) @@ -305,23 +305,23 @@ * Compute space needed for DGELQF CALL DGELQF( M, N, A, LDA, DUM(1), DUM(1), $ -1, INFO ) - LWORK_DGELQF=DUM(1) + LWORK_DGELQF = INT( DUM(1) ) * Compute space needed for DGEBRD CALL DGEBRD( M, M, A, LDA, S, DUM(1), DUM(1), $ DUM(1), DUM(1), -1, INFO ) - LWORK_DGEBRD=DUM(1) + LWORK_DGEBRD = INT( DUM(1) ) * Compute space needed for DORMBR CALL DORMBR( 'Q', 'L', 'T', M, NRHS, N, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_DORMBR=DUM(1) + LWORK_DORMBR = INT( DUM(1) ) * Compute space needed for DORGBR CALL DORGBR( 'P', M, M, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_DORGBR=DUM(1) + LWORK_DORGBR = INT( DUM(1) ) * Compute space needed for DORMLQ CALL DORMLQ( 'L', 'T', N, NRHS, M, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_DORMLQ=DUM(1) + LWORK_DORMLQ = INT( DUM(1) ) * Compute total workspace needed MAXWRK = M + LWORK_DGELQF MAXWRK = MAX( MAXWRK, M*M + 4*M + LWORK_DGEBRD ) @@ -341,15 +341,15 @@ * Compute space needed for DGEBRD CALL DGEBRD( M, N, A, LDA, S, DUM(1), DUM(1), $ DUM(1), DUM(1), -1, INFO ) - LWORK_DGEBRD=DUM(1) + LWORK_DGEBRD = INT( DUM(1) ) * Compute space needed for DORMBR CALL DORMBR( 'Q', 'L', 'T', M, NRHS, M, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_DORMBR=DUM(1) + LWORK_DORMBR = INT( DUM(1) ) * Compute space needed for DORGBR CALL 
DORGBR( 'P', M, N, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_DORGBR=DUM(1) + LWORK_DORGBR = INT( DUM(1) ) MAXWRK = 3*M + LWORK_DGEBRD MAXWRK = MAX( MAXWRK, 3*M + LWORK_DORMBR ) MAXWRK = MAX( MAXWRK, 3*M + LWORK_DORGBR ) diff --git a/lapack-netlib/SRC/dggglm.f b/lapack-netlib/SRC/dggglm.f index d43785d32..ae0f0e908 100644 --- a/lapack-netlib/SRC/dggglm.f +++ b/lapack-netlib/SRC/dggglm.f @@ -288,7 +288,7 @@ * CALL DGGQRF( N, M, P, A, LDA, WORK, B, LDB, WORK( M+1 ), $ WORK( M+NP+1 ), LWORK-M-NP, INFO ) - LOPT = WORK( M+NP+1 ) + LOPT = INT( WORK( M+NP+1 ) ) * * Update left-hand-side vector d = Q**T*d = ( d1 ) M * ( d2 ) N-M diff --git a/lapack-netlib/SRC/dgglse.f b/lapack-netlib/SRC/dgglse.f index 2fd17bbcb..28aeaf6e7 100644 --- a/lapack-netlib/SRC/dgglse.f +++ b/lapack-netlib/SRC/dgglse.f @@ -276,7 +276,7 @@ * CALL DGGRQF( P, M, N, B, LDB, WORK, A, LDA, WORK( P+1 ), $ WORK( P+MN+1 ), LWORK-P-MN, INFO ) - LOPT = WORK( P+MN+1 ) + LOPT = INT( WORK( P+MN+1 ) ) * * Update c = Z**T *c = ( c1 ) N-P * ( c2 ) M+P-N diff --git a/lapack-netlib/SRC/dggqrf.f b/lapack-netlib/SRC/dggqrf.f index 617af274f..39d27a5c9 100644 --- a/lapack-netlib/SRC/dggqrf.f +++ b/lapack-netlib/SRC/dggqrf.f @@ -276,7 +276,7 @@ * QR factorization of N-by-M matrix A: A = Q*R * CALL DGEQRF( N, M, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = WORK( 1 ) + LOPT = INT( WORK( 1 ) ) * * Update B := Q**T*B. * diff --git a/lapack-netlib/SRC/dggrqf.f b/lapack-netlib/SRC/dggrqf.f index 07f8752d8..ddf4104c5 100644 --- a/lapack-netlib/SRC/dggrqf.f +++ b/lapack-netlib/SRC/dggrqf.f @@ -275,7 +275,7 @@ * RQ factorization of M-by-N matrix A: A = R*Q * CALL DGERQF( M, N, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = WORK( 1 ) + LOPT = INT( WORK( 1 ) ) * * Update B := B*Q**T * diff --git a/lapack-netlib/SRC/dlag2s.f b/lapack-netlib/SRC/dlag2s.f index e5a930223..9e6dead49 100644 --- a/lapack-netlib/SRC/dlag2s.f +++ b/lapack-netlib/SRC/dlag2s.f @@ -34,8 +34,8 @@ *> *> \verbatim *> -*> DLAG2S converts a DOUBLE PRECISION matrix, SA, to a SINGLE -*> PRECISION matrix, A. +*> DLAG2S converts a DOUBLE PRECISION matrix, A, to a SINGLE +*> PRECISION matrix, SA. *> *> RMAX is the overflow for the SINGLE PRECISION arithmetic *> DLAG2S checks that all the entries of A are between -RMAX and @@ -128,6 +128,9 @@ REAL SLAMCH EXTERNAL SLAMCH * .. +* .. Intrinsic Functions .. + INTRINSIC REAL +* .. * .. Executable Statements .. * RMAX = SLAMCH( 'O' ) @@ -137,7 +140,7 @@ INFO = 1 GO TO 30 END IF - SA( I, J ) = A( I, J ) + SA( I, J ) = REAL( A( I, J ) ) 10 CONTINUE 20 CONTINUE INFO = 0 diff --git a/lapack-netlib/SRC/dlat2s.f b/lapack-netlib/SRC/dlat2s.f index 3d00fe0a3..c926e9930 100644 --- a/lapack-netlib/SRC/dlat2s.f +++ b/lapack-netlib/SRC/dlat2s.f @@ -134,6 +134,9 @@ LOGICAL LSAME EXTERNAL SLAMCH, LSAME * .. +* .. Intrinsic Functions .. + INTRINSIC REAL +* .. * .. Executable Statements .. 
* RMAX = SLAMCH( 'O' ) @@ -146,7 +149,7 @@ INFO = 1 GO TO 50 END IF - SA( I, J ) = A( I, J ) + SA( I, J ) = REAL( A( I, J ) ) 10 CONTINUE 20 CONTINUE ELSE @@ -157,7 +160,7 @@ INFO = 1 GO TO 50 END IF - SA( I, J ) = A( I, J ) + SA( I, J ) = REAL( A( I, J ) ) 30 CONTINUE 40 CONTINUE END IF diff --git a/lapack-netlib/SRC/dorgbr.f b/lapack-netlib/SRC/dorgbr.f index 1b242ff97..7dfd03961 100644 --- a/lapack-netlib/SRC/dorgbr.f +++ b/lapack-netlib/SRC/dorgbr.f @@ -232,7 +232,7 @@ END IF END IF END IF - LWKOPT = WORK( 1 ) + LWKOPT = INT( WORK( 1 ) ) LWKOPT = MAX (LWKOPT, MN) END IF * diff --git a/lapack-netlib/SRC/dspgvd.f b/lapack-netlib/SRC/dspgvd.f index 556326388..df215ae1a 100644 --- a/lapack-netlib/SRC/dspgvd.f +++ b/lapack-netlib/SRC/dspgvd.f @@ -307,8 +307,8 @@ CALL DSPGST( ITYPE, UPLO, N, AP, BP, INFO ) CALL DSPEVD( JOBZ, UPLO, N, AP, W, Z, LDZ, WORK, LWORK, IWORK, $ LIWORK, INFO ) - LWMIN = MAX( DBLE( LWMIN ), DBLE( WORK( 1 ) ) ) - LIWMIN = MAX( DBLE( LIWMIN ), DBLE( IWORK( 1 ) ) ) + LWMIN = INT( MAX( DBLE( LWMIN ), DBLE( WORK( 1 ) ) ) ) + LIWMIN = INT( MAX( DBLE( LIWMIN ), DBLE( IWORK( 1 ) ) ) ) * IF( WANTZ ) THEN * diff --git a/lapack-netlib/SRC/dsygvd.f b/lapack-netlib/SRC/dsygvd.f index 61134bedc..3b38665a7 100644 --- a/lapack-netlib/SRC/dsygvd.f +++ b/lapack-netlib/SRC/dsygvd.f @@ -330,8 +330,8 @@ CALL DSYGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO ) CALL DSYEVD( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, IWORK, LIWORK, $ INFO ) - LOPT = MAX( DBLE( LOPT ), DBLE( WORK( 1 ) ) ) - LIOPT = MAX( DBLE( LIOPT ), DBLE( IWORK( 1 ) ) ) + LOPT = INT( MAX( DBLE( LOPT ), DBLE( WORK( 1 ) ) ) ) + LIOPT = INT( MAX( DBLE( LIOPT ), DBLE( IWORK( 1 ) ) ) ) * IF( WANTZ .AND. INFO.EQ.0 ) THEN * diff --git a/lapack-netlib/SRC/dsysv.f b/lapack-netlib/SRC/dsysv.f index a6305e13c..ed6629ad9 100644 --- a/lapack-netlib/SRC/dsysv.f +++ b/lapack-netlib/SRC/dsysv.f @@ -223,7 +223,7 @@ LWKOPT = 1 ELSE CALL DSYTRF( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = WORK(1) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/dsysv_rk.f b/lapack-netlib/SRC/dsysv_rk.f index 05d8f7d3f..db8fd36dd 100644 --- a/lapack-netlib/SRC/dsysv_rk.f +++ b/lapack-netlib/SRC/dsysv_rk.f @@ -280,7 +280,7 @@ LWKOPT = 1 ELSE CALL DSYTRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) - LWKOPT = WORK(1) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/dsysv_rook.f b/lapack-netlib/SRC/dsysv_rook.f index 6ebb52eae..85f293309 100644 --- a/lapack-netlib/SRC/dsysv_rook.f +++ b/lapack-netlib/SRC/dsysv_rook.f @@ -256,7 +256,7 @@ LWKOPT = 1 ELSE CALL DSYTRF_ROOK( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = WORK(1) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF From e9b0f5a3648572db51b810afd8e0cb42993175e6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Nov 2022 15:11:05 +0100 Subject: [PATCH 110/154] Define type conversions explicitly (Reference-LAPACK PR 703) --- lapack-netlib/SRC/sgebak.f | 4 ++-- lapack-netlib/SRC/sgees.f | 2 +- lapack-netlib/SRC/sgeesx.f | 2 +- lapack-netlib/SRC/sggbak.f | 8 ++++---- lapack-netlib/SRC/sggbal.f | 4 ++-- lapack-netlib/SRC/sggglm.f | 2 +- lapack-netlib/SRC/sgglse.f | 2 +- lapack-netlib/SRC/sggqrf.f | 2 +- lapack-netlib/SRC/sggrqf.f | 2 +- lapack-netlib/SRC/sorgbr.f | 2 +- lapack-netlib/SRC/sspgvd.f | 4 ++-- lapack-netlib/SRC/ssygvd.f | 4 ++-- lapack-netlib/SRC/ssysv.f | 2 +- lapack-netlib/SRC/ssysv_rk.f | 2 +- lapack-netlib/SRC/ssysv_rook.f | 2 +- 15 files changed, 22 insertions(+), 22 deletions(-) diff 
--git a/lapack-netlib/SRC/sgebak.f b/lapack-netlib/SRC/sgebak.f index b51b611a9..abb7809a3 100644 --- a/lapack-netlib/SRC/sgebak.f +++ b/lapack-netlib/SRC/sgebak.f @@ -236,7 +236,7 @@ $ GO TO 40 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 40 CALL SSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -250,7 +250,7 @@ $ GO TO 50 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 50 CALL SSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) diff --git a/lapack-netlib/SRC/sgees.f b/lapack-netlib/SRC/sgees.f index d40503f89..6febd549c 100644 --- a/lapack-netlib/SRC/sgees.f +++ b/lapack-netlib/SRC/sgees.f @@ -302,7 +302,7 @@ * CALL SHSEQR( 'S', JOBVS, N, 1, N, A, LDA, WR, WI, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = WORK( 1 ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, N + HSWORK ) diff --git a/lapack-netlib/SRC/sgeesx.f b/lapack-netlib/SRC/sgeesx.f index 27c4338d4..6810fe7c8 100644 --- a/lapack-netlib/SRC/sgeesx.f +++ b/lapack-netlib/SRC/sgeesx.f @@ -382,7 +382,7 @@ * CALL SHSEQR( 'S', JOBVS, N, 1, N, A, LDA, WR, WI, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = WORK( 1 ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, N + HSWORK ) diff --git a/lapack-netlib/SRC/sggbak.f b/lapack-netlib/SRC/sggbak.f index bb7f36011..8a796fdb1 100644 --- a/lapack-netlib/SRC/sggbak.f +++ b/lapack-netlib/SRC/sggbak.f @@ -252,7 +252,7 @@ $ GO TO 50 * DO 40 I = ILO - 1, 1, -1 - K = RSCALE( I ) + K = INT( RSCALE( I ) ) IF( K.EQ.I ) $ GO TO 40 CALL SSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -262,7 +262,7 @@ IF( IHI.EQ.N ) $ GO TO 70 DO 60 I = IHI + 1, N - K = RSCALE( I ) + K = INT( RSCALE( I ) ) IF( K.EQ.I ) $ GO TO 60 CALL SSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -276,7 +276,7 @@ IF( ILO.EQ.1 ) $ GO TO 90 DO 80 I = ILO - 1, 1, -1 - K = LSCALE( I ) + K = INT( LSCALE( I ) ) IF( K.EQ.I ) $ GO TO 80 CALL SSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -286,7 +286,7 @@ IF( IHI.EQ.N ) $ GO TO 110 DO 100 I = IHI + 1, N - K = LSCALE( I ) + K = INT( LSCALE( I ) ) IF( K.EQ.I ) $ GO TO 100 CALL SSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) diff --git a/lapack-netlib/SRC/sggbal.f b/lapack-netlib/SRC/sggbal.f index 6cfdbcdba..d7a8ef16c 100644 --- a/lapack-netlib/SRC/sggbal.f +++ b/lapack-netlib/SRC/sggbal.f @@ -522,7 +522,7 @@ IRAB = ISAMAX( N-ILO+1, B( I, ILO ), LDB ) RAB = MAX( RAB, ABS( B( I, IRAB+ILO-1 ) ) ) LRAB = INT( LOG10( RAB+SFMIN ) / BASL+ONE ) - IR = LSCALE( I ) + SIGN( HALF, LSCALE( I ) ) + IR = INT( LSCALE( I ) + SIGN( HALF, LSCALE( I ) ) ) IR = MIN( MAX( IR, LSFMIN ), LSFMAX, LSFMAX-LRAB ) LSCALE( I ) = SCLFAC**IR ICAB = ISAMAX( IHI, A( 1, I ), 1 ) @@ -530,7 +530,7 @@ ICAB = ISAMAX( IHI, B( 1, I ), 1 ) CAB = MAX( CAB, ABS( B( ICAB, I ) ) ) LCAB = INT( LOG10( CAB+SFMIN ) / BASL+ONE ) - JC = RSCALE( I ) + SIGN( HALF, RSCALE( I ) ) + JC = INT( RSCALE( I ) + SIGN( HALF, RSCALE( I ) ) ) JC = MIN( MAX( JC, LSFMIN ), LSFMAX, LSFMAX-LCAB ) RSCALE( I ) = SCLFAC**JC 360 CONTINUE diff --git a/lapack-netlib/SRC/sggglm.f b/lapack-netlib/SRC/sggglm.f index bbd032beb..56b4dba52 100644 --- a/lapack-netlib/SRC/sggglm.f +++ b/lapack-netlib/SRC/sggglm.f @@ -288,7 +288,7 @@ * CALL SGGQRF( N, M, P, A, LDA, WORK, B, LDB, WORK( M+1 ), $ WORK( M+NP+1 ), LWORK-M-NP, INFO ) - LOPT = WORK( M+NP+1 ) + LOPT = INT( WORK( M+NP+1 ) ) * * Update left-hand-side vector d = Q**T*d = ( d1 ) M * ( d2 ) N-M diff --git a/lapack-netlib/SRC/sgglse.f b/lapack-netlib/SRC/sgglse.f index 7ef8782b0..59addc3f4 100644 --- 
a/lapack-netlib/SRC/sgglse.f +++ b/lapack-netlib/SRC/sgglse.f @@ -276,7 +276,7 @@ * CALL SGGRQF( P, M, N, B, LDB, WORK, A, LDA, WORK( P+1 ), $ WORK( P+MN+1 ), LWORK-P-MN, INFO ) - LOPT = WORK( P+MN+1 ) + LOPT = INT( WORK( P+MN+1 ) ) * * Update c = Z**T *c = ( c1 ) N-P * ( c2 ) M+P-N diff --git a/lapack-netlib/SRC/sggqrf.f b/lapack-netlib/SRC/sggqrf.f index c57b16a56..59b498da5 100644 --- a/lapack-netlib/SRC/sggqrf.f +++ b/lapack-netlib/SRC/sggqrf.f @@ -276,7 +276,7 @@ * QR factorization of N-by-M matrix A: A = Q*R * CALL SGEQRF( N, M, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = WORK( 1 ) + LOPT = INT( WORK( 1 ) ) * * Update B := Q**T*B. * diff --git a/lapack-netlib/SRC/sggrqf.f b/lapack-netlib/SRC/sggrqf.f index c4a78c347..8b7d4786a 100644 --- a/lapack-netlib/SRC/sggrqf.f +++ b/lapack-netlib/SRC/sggrqf.f @@ -275,7 +275,7 @@ * RQ factorization of M-by-N matrix A: A = R*Q * CALL SGERQF( M, N, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = WORK( 1 ) + LOPT = INT( WORK( 1 ) ) * * Update B := B*Q**T * diff --git a/lapack-netlib/SRC/sorgbr.f b/lapack-netlib/SRC/sorgbr.f index 8f15523d4..b1a5c03a2 100644 --- a/lapack-netlib/SRC/sorgbr.f +++ b/lapack-netlib/SRC/sorgbr.f @@ -232,7 +232,7 @@ END IF END IF END IF - LWKOPT = WORK( 1 ) + LWKOPT = INT( WORK( 1 ) ) LWKOPT = MAX (LWKOPT, MN) END IF * diff --git a/lapack-netlib/SRC/sspgvd.f b/lapack-netlib/SRC/sspgvd.f index 9db8de08c..73862ed1b 100644 --- a/lapack-netlib/SRC/sspgvd.f +++ b/lapack-netlib/SRC/sspgvd.f @@ -307,8 +307,8 @@ CALL SSPGST( ITYPE, UPLO, N, AP, BP, INFO ) CALL SSPEVD( JOBZ, UPLO, N, AP, W, Z, LDZ, WORK, LWORK, IWORK, $ LIWORK, INFO ) - LWMIN = MAX( REAL( LWMIN ), REAL( WORK( 1 ) ) ) - LIWMIN = MAX( REAL( LIWMIN ), REAL( IWORK( 1 ) ) ) + LWMIN = INT( MAX( REAL( LWMIN ), REAL( WORK( 1 ) ) ) ) + LIWMIN = INT( MAX( REAL( LIWMIN ), REAL( IWORK( 1 ) ) ) ) * IF( WANTZ ) THEN * diff --git a/lapack-netlib/SRC/ssygvd.f b/lapack-netlib/SRC/ssygvd.f index 9002df237..7c7e0de01 100644 --- a/lapack-netlib/SRC/ssygvd.f +++ b/lapack-netlib/SRC/ssygvd.f @@ -330,8 +330,8 @@ CALL SSYGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO ) CALL SSYEVD( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, IWORK, LIWORK, $ INFO ) - LOPT = MAX( REAL( LOPT ), REAL( WORK( 1 ) ) ) - LIOPT = MAX( REAL( LIOPT ), REAL( IWORK( 1 ) ) ) + LOPT = INT( MAX( REAL( LOPT ), REAL( WORK( 1 ) ) ) ) + LIOPT = INT( MAX( REAL( LIOPT ), REAL( IWORK( 1 ) ) ) ) * IF( WANTZ .AND. 
INFO.EQ.0 ) THEN * diff --git a/lapack-netlib/SRC/ssysv.f b/lapack-netlib/SRC/ssysv.f index 5f4062e9a..06a42dfb7 100644 --- a/lapack-netlib/SRC/ssysv.f +++ b/lapack-netlib/SRC/ssysv.f @@ -223,7 +223,7 @@ LWKOPT = 1 ELSE CALL SSYTRF( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = WORK(1) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/ssysv_rk.f b/lapack-netlib/SRC/ssysv_rk.f index 9e0487623..9a7dfa4bb 100644 --- a/lapack-netlib/SRC/ssysv_rk.f +++ b/lapack-netlib/SRC/ssysv_rk.f @@ -280,7 +280,7 @@ LWKOPT = 1 ELSE CALL SSYTRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) - LWKOPT = WORK(1) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/ssysv_rook.f b/lapack-netlib/SRC/ssysv_rook.f index b4da1101c..fb7ba8c53 100644 --- a/lapack-netlib/SRC/ssysv_rook.f +++ b/lapack-netlib/SRC/ssysv_rook.f @@ -256,7 +256,7 @@ LWKOPT = 1 ELSE CALL SSYTRF_ROOK( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = WORK(1) + LWKOPT = INT( WORK( 1 ) ) END IF WORK( 1 ) = LWKOPT END IF From 4e60737c2d914de2385c66dfb097b8d3d4d73d10 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Nov 2022 15:22:46 +0100 Subject: [PATCH 111/154] Define type conversions explicitly (Reference-LAPACK PR 703) --- lapack-netlib/SRC/zgebak.f | 4 ++-- lapack-netlib/SRC/zgees.f | 2 +- lapack-netlib/SRC/zgeesx.f | 2 +- lapack-netlib/SRC/zgejsv.f | 36 +++++++++++++++++----------------- lapack-netlib/SRC/zggglm.f | 2 +- lapack-netlib/SRC/zgglse.f | 2 +- lapack-netlib/SRC/zggqrf.f | 2 +- lapack-netlib/SRC/zggrqf.f | 2 +- lapack-netlib/SRC/zhegvd.f | 6 +++--- lapack-netlib/SRC/zhesv_rk.f | 2 +- lapack-netlib/SRC/zhpgvd.f | 6 +++--- lapack-netlib/SRC/zlag2c.f | 4 ++-- lapack-netlib/SRC/zlaic1.f | 4 ++-- lapack-netlib/SRC/zlat2c.f | 6 +++--- lapack-netlib/SRC/zsysv.f | 2 +- lapack-netlib/SRC/zsysv_rk.f | 2 +- lapack-netlib/SRC/zsysv_rook.f | 2 +- lapack-netlib/SRC/zungbr.f | 2 +- 18 files changed, 44 insertions(+), 44 deletions(-) diff --git a/lapack-netlib/SRC/zgebak.f b/lapack-netlib/SRC/zgebak.f index 9ec610efb..9a0f65a43 100644 --- a/lapack-netlib/SRC/zgebak.f +++ b/lapack-netlib/SRC/zgebak.f @@ -238,7 +238,7 @@ $ GO TO 40 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 40 CALL ZSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) @@ -252,7 +252,7 @@ $ GO TO 50 IF( I.LT.ILO ) $ I = ILO - II - K = SCALE( I ) + K = INT( SCALE( I ) ) IF( K.EQ.I ) $ GO TO 50 CALL ZSWAP( M, V( I, 1 ), LDV, V( K, 1 ), LDV ) diff --git a/lapack-netlib/SRC/zgees.f b/lapack-netlib/SRC/zgees.f index 40fe78d34..d673087bf 100644 --- a/lapack-netlib/SRC/zgees.f +++ b/lapack-netlib/SRC/zgees.f @@ -282,7 +282,7 @@ * CALL ZHSEQR( 'S', JOBVS, N, 1, N, A, LDA, W, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = DBLE( WORK( 1 ) ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, HSWORK ) diff --git a/lapack-netlib/SRC/zgeesx.f b/lapack-netlib/SRC/zgeesx.f index ca4f5c913..bdd741b11 100644 --- a/lapack-netlib/SRC/zgeesx.f +++ b/lapack-netlib/SRC/zgeesx.f @@ -337,7 +337,7 @@ * CALL ZHSEQR( 'S', JOBVS, N, 1, N, A, LDA, W, VS, LDVS, $ WORK, -1, IEVAL ) - HSWORK = DBLE( WORK( 1 ) ) + HSWORK = INT( WORK( 1 ) ) * IF( .NOT.WANTVS ) THEN MAXWRK = MAX( MAXWRK, HSWORK ) diff --git a/lapack-netlib/SRC/zgejsv.f b/lapack-netlib/SRC/zgejsv.f index 0c2226f9f..d1106696c 100644 --- a/lapack-netlib/SRC/zgejsv.f +++ b/lapack-netlib/SRC/zgejsv.f @@ -707,11 +707,11 @@ IF ( LQUERY ) THEN CALL ZGEQP3( M, N, A, LDA, IWORK, CDUMMY, CDUMMY, -1, $ RDUMMY, IERR 
) - LWRK_ZGEQP3 = DBLE( CDUMMY(1) ) + LWRK_ZGEQP3 = INT( CDUMMY(1) ) CALL ZGEQRF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) - LWRK_ZGEQRF = DBLE( CDUMMY(1) ) + LWRK_ZGEQRF = INT( CDUMMY(1) ) CALL ZGELQF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR ) - LWRK_ZGELQF = DBLE( CDUMMY(1) ) + LWRK_ZGELQF = INT( CDUMMY(1) ) END IF MINWRK = 2 OPTWRK = 2 @@ -727,7 +727,7 @@ IF ( LQUERY ) THEN CALL ZGESVJ( 'L', 'N', 'N', N, N, A, LDA, SVA, N, V, $ LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJ = DBLE( CDUMMY(1) ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_ZGEQP3, N**2+LWCON, $ N+LWRK_ZGEQRF, LWRK_ZGESVJ ) @@ -763,10 +763,10 @@ IF ( LQUERY ) THEN CALL ZGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJ = DBLE( CDUMMY(1) ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) CALL ZUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, $ V, LDV, CDUMMY, -1, IERR ) - LWRK_ZUNMLQ = DBLE( CDUMMY(1) ) + LWRK_ZUNMLQ = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_ZGEQP3, LWCON, LWRK_ZGESVJ, $ N+LWRK_ZGELQF, 2*N+LWRK_ZGEQRF, @@ -802,10 +802,10 @@ IF ( LQUERY ) THEN CALL ZGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A, $ LDA, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJ = DBLE( CDUMMY(1) ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_ZUNMQRM = DBLE( CDUMMY(1) ) + LWRK_ZUNMQRM = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = N + MAX( LWRK_ZGEQP3, LWCON, N+LWRK_ZGEQRF, $ LWRK_ZGESVJ, LWRK_ZUNMQRM ) @@ -864,26 +864,26 @@ IF ( LQUERY ) THEN CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_ZUNMQRM = DBLE( CDUMMY(1) ) + LWRK_ZUNMQRM = INT( CDUMMY(1) ) CALL ZUNMQR( 'L', 'N', N, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_ZUNMQR = DBLE( CDUMMY(1) ) + LWRK_ZUNMQR = INT( CDUMMY(1) ) IF ( .NOT. 
JRACC ) THEN CALL ZGEQP3( N,N, A, LDA, IWORK, CDUMMY,CDUMMY, -1, $ RDUMMY, IERR ) - LWRK_ZGEQP3N = DBLE( CDUMMY(1) ) + LWRK_ZGEQP3N = INT( CDUMMY(1) ) CALL ZGESVJ( 'L', 'U', 'N', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJ = DBLE( CDUMMY(1) ) + LWRK_ZGESVJ = INT( CDUMMY(1) ) CALL ZGESVJ( 'U', 'U', 'N', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJU = DBLE( CDUMMY(1) ) + LWRK_ZGESVJU = INT( CDUMMY(1) ) CALL ZGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJV = DBLE( CDUMMY(1) ) + LWRK_ZGESVJV = INT( CDUMMY(1) ) CALL ZUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY, $ V, LDV, CDUMMY, -1, IERR ) - LWRK_ZUNMLQ = DBLE( CDUMMY(1) ) + LWRK_ZUNMLQ = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_ZGEQP3, N+LWCON, $ 2*N+N**2+LWCON, 2*N+LWRK_ZGEQRF, @@ -912,13 +912,13 @@ ELSE CALL ZGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA, $ N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR ) - LWRK_ZGESVJV = DBLE( CDUMMY(1) ) + LWRK_ZGESVJV = INT( CDUMMY(1) ) CALL ZUNMQR( 'L', 'N', N, N, N, CDUMMY, N, CDUMMY, $ V, LDV, CDUMMY, -1, IERR ) - LWRK_ZUNMQR = DBLE( CDUMMY(1) ) + LWRK_ZUNMQR = INT( CDUMMY(1) ) CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U, $ LDU, CDUMMY, -1, IERR ) - LWRK_ZUNMQRM = DBLE( CDUMMY(1) ) + LWRK_ZUNMQRM = INT( CDUMMY(1) ) IF ( ERREST ) THEN OPTWRK = MAX( N+LWRK_ZGEQP3, N+LWCON, $ 2*N+LWRK_ZGEQRF, 2*N+N**2, diff --git a/lapack-netlib/SRC/zggglm.f b/lapack-netlib/SRC/zggglm.f index 6c24131aa..62b4acdec 100644 --- a/lapack-netlib/SRC/zggglm.f +++ b/lapack-netlib/SRC/zggglm.f @@ -289,7 +289,7 @@ * CALL ZGGQRF( N, M, P, A, LDA, WORK, B, LDB, WORK( M+1 ), $ WORK( M+NP+1 ), LWORK-M-NP, INFO ) - LOPT = DBLE( WORK( M+NP+1 ) ) + LOPT = INT( WORK( M+NP+1 ) ) * * Update left-hand-side vector d = Q**H*d = ( d1 ) M * ( d2 ) N-M diff --git a/lapack-netlib/SRC/zgglse.f b/lapack-netlib/SRC/zgglse.f index e5869a7d4..cc558bc40 100644 --- a/lapack-netlib/SRC/zgglse.f +++ b/lapack-netlib/SRC/zgglse.f @@ -276,7 +276,7 @@ * CALL ZGGRQF( P, M, N, B, LDB, WORK, A, LDA, WORK( P+1 ), $ WORK( P+MN+1 ), LWORK-P-MN, INFO ) - LOPT = DBLE( WORK( P+MN+1 ) ) + LOPT = INT( WORK( P+MN+1 ) ) * * Update c = Z**H *c = ( c1 ) N-P * ( c2 ) M+P-N diff --git a/lapack-netlib/SRC/zggqrf.f b/lapack-netlib/SRC/zggqrf.f index 93b1dc0fc..0388b0874 100644 --- a/lapack-netlib/SRC/zggqrf.f +++ b/lapack-netlib/SRC/zggqrf.f @@ -276,7 +276,7 @@ * QR factorization of N-by-M matrix A: A = Q*R * CALL ZGEQRF( N, M, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = DBLE( WORK( 1 ) ) + LOPT = INT( WORK( 1 ) ) * * Update B := Q**H*B. 
* diff --git a/lapack-netlib/SRC/zggrqf.f b/lapack-netlib/SRC/zggrqf.f index a2d4a9d55..be912c772 100644 --- a/lapack-netlib/SRC/zggrqf.f +++ b/lapack-netlib/SRC/zggrqf.f @@ -275,7 +275,7 @@ * RQ factorization of M-by-N matrix A: A = R*Q * CALL ZGERQF( M, N, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = DBLE( WORK( 1 ) ) + LOPT = INT( WORK( 1 ) ) * * Update B := B*Q**H * diff --git a/lapack-netlib/SRC/zhegvd.f b/lapack-netlib/SRC/zhegvd.f index 2e92255df..eeda656ad 100644 --- a/lapack-netlib/SRC/zhegvd.f +++ b/lapack-netlib/SRC/zhegvd.f @@ -360,9 +360,9 @@ CALL ZHEGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO ) CALL ZHEEVD( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, RWORK, LRWORK, $ IWORK, LIWORK, INFO ) - LOPT = MAX( DBLE( LOPT ), DBLE( WORK( 1 ) ) ) - LROPT = MAX( DBLE( LROPT ), DBLE( RWORK( 1 ) ) ) - LIOPT = MAX( DBLE( LIOPT ), DBLE( IWORK( 1 ) ) ) + LOPT = INT( MAX( DBLE( LOPT ), DBLE( WORK( 1 ) ) ) ) + LROPT = INT( MAX( DBLE( LROPT ), DBLE( RWORK( 1 ) ) ) ) + LIOPT = INT( MAX( DBLE( LIOPT ), DBLE( IWORK( 1 ) ) ) ) * IF( WANTZ .AND. INFO.EQ.0 ) THEN * diff --git a/lapack-netlib/SRC/zhesv_rk.f b/lapack-netlib/SRC/zhesv_rk.f index 1ec75cc04..6333e9f36 100644 --- a/lapack-netlib/SRC/zhesv_rk.f +++ b/lapack-netlib/SRC/zhesv_rk.f @@ -280,7 +280,7 @@ LWKOPT = 1 ELSE CALL ZHETRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) - LWKOPT = DBLE( WORK(1) ) + LWKOPT = INT( DBLE( WORK( 1 ) ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/zhpgvd.f b/lapack-netlib/SRC/zhpgvd.f index d27cdc761..e96e39738 100644 --- a/lapack-netlib/SRC/zhpgvd.f +++ b/lapack-netlib/SRC/zhpgvd.f @@ -335,9 +335,9 @@ CALL ZHPGST( ITYPE, UPLO, N, AP, BP, INFO ) CALL ZHPEVD( JOBZ, UPLO, N, AP, W, Z, LDZ, WORK, LWORK, RWORK, $ LRWORK, IWORK, LIWORK, INFO ) - LWMIN = MAX( DBLE( LWMIN ), DBLE( WORK( 1 ) ) ) - LRWMIN = MAX( DBLE( LRWMIN ), DBLE( RWORK( 1 ) ) ) - LIWMIN = MAX( DBLE( LIWMIN ), DBLE( IWORK( 1 ) ) ) + LWMIN = INT( MAX( DBLE( LWMIN ), DBLE( WORK( 1 ) ) ) ) + LRWMIN = INT( MAX( DBLE( LRWMIN ), DBLE( RWORK( 1 ) ) ) ) + LIWMIN = INT( MAX( DBLE( LIWMIN ), DBLE( IWORK( 1 ) ) ) ) * IF( WANTZ ) THEN * diff --git a/lapack-netlib/SRC/zlag2c.f b/lapack-netlib/SRC/zlag2c.f index ba141a98f..434590bb9 100644 --- a/lapack-netlib/SRC/zlag2c.f +++ b/lapack-netlib/SRC/zlag2c.f @@ -124,7 +124,7 @@ DOUBLE PRECISION RMAX * .. * .. Intrinsic Functions .. - INTRINSIC DBLE, DIMAG + INTRINSIC DBLE, DIMAG, CMPLX * .. * .. External Functions .. REAL SLAMCH @@ -142,7 +142,7 @@ INFO = 1 GO TO 30 END IF - SA( I, J ) = A( I, J ) + SA( I, J ) = CMPLX( A( I, J ) ) 10 CONTINUE 20 CONTINUE INFO = 0 diff --git a/lapack-netlib/SRC/zlaic1.f b/lapack-netlib/SRC/zlaic1.f index 72948cde9..47927e778 100644 --- a/lapack-netlib/SRC/zlaic1.f +++ b/lapack-netlib/SRC/zlaic1.f @@ -348,9 +348,9 @@ B = ( ZETA2*ZETA2+ZETA1*ZETA1-ONE )*HALF C = ZETA1*ZETA1 IF( B.GE.ZERO ) THEN - T = -C / ( B+SQRT( B*B+C ) ) + T = DBLE( -C / ( B+SQRT( B*B+C ) ) ) ELSE - T = B - SQRT( B*B+C ) + T = DBLE( B - SQRT( B*B+C ) ) END IF SINE = -( ALPHA / ABSEST ) / T COSINE = -( GAMMA / ABSEST ) / ( ONE+T ) diff --git a/lapack-netlib/SRC/zlat2c.f b/lapack-netlib/SRC/zlat2c.f index 1d607dcea..a413b05c1 100644 --- a/lapack-netlib/SRC/zlat2c.f +++ b/lapack-netlib/SRC/zlat2c.f @@ -130,7 +130,7 @@ LOGICAL UPPER * .. * .. Intrinsic Functions .. - INTRINSIC DBLE, DIMAG + INTRINSIC DBLE, DIMAG, CMPLX * .. * .. External Functions .. 
REAL SLAMCH @@ -151,7 +151,7 @@ INFO = 1 GO TO 50 END IF - SA( I, J ) = A( I, J ) + SA( I, J ) = CMPLX( A( I, J ) ) 10 CONTINUE 20 CONTINUE ELSE @@ -164,7 +164,7 @@ INFO = 1 GO TO 50 END IF - SA( I, J ) = A( I, J ) + SA( I, J ) = CMPLX( A( I, J ) ) 30 CONTINUE 40 CONTINUE END IF diff --git a/lapack-netlib/SRC/zsysv.f b/lapack-netlib/SRC/zsysv.f index ed173dadc..44f1e25b1 100644 --- a/lapack-netlib/SRC/zsysv.f +++ b/lapack-netlib/SRC/zsysv.f @@ -223,7 +223,7 @@ LWKOPT = 1 ELSE CALL ZSYTRF( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = DBLE( WORK(1) ) + LWKOPT = INT( DBLE( WORK( 1 ) ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/zsysv_rk.f b/lapack-netlib/SRC/zsysv_rk.f index df828ee33..8d9fb82c8 100644 --- a/lapack-netlib/SRC/zsysv_rk.f +++ b/lapack-netlib/SRC/zsysv_rk.f @@ -280,7 +280,7 @@ LWKOPT = 1 ELSE CALL ZSYTRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO ) - LWKOPT = DBLE( WORK(1) ) + LWKOPT = INT( DBLE( WORK( 1 ) ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/zsysv_rook.f b/lapack-netlib/SRC/zsysv_rook.f index 7c9fb4bf6..745339512 100644 --- a/lapack-netlib/SRC/zsysv_rook.f +++ b/lapack-netlib/SRC/zsysv_rook.f @@ -256,7 +256,7 @@ LWKOPT = 1 ELSE CALL ZSYTRF_ROOK( UPLO, N, A, LDA, IPIV, WORK, -1, INFO ) - LWKOPT = DBLE( WORK(1) ) + LWKOPT = INT( DBLE( WORK( 1 ) ) ) END IF WORK( 1 ) = LWKOPT END IF diff --git a/lapack-netlib/SRC/zungbr.f b/lapack-netlib/SRC/zungbr.f index 3dfca43be..c42a372c5 100644 --- a/lapack-netlib/SRC/zungbr.f +++ b/lapack-netlib/SRC/zungbr.f @@ -233,7 +233,7 @@ END IF END IF END IF - LWKOPT = DBLE( WORK( 1 ) ) + LWKOPT = INT( DBLE( WORK( 1 ) ) ) LWKOPT = MAX (LWKOPT, MN) END IF * From 15967809adb4275a1c5b11cd9a3fc10be3b13c3e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Nov 2022 19:15:09 +0100 Subject: [PATCH 112/154] Define type conversions explicitly (Reference-LAPACK PR703) --- lapack-netlib/SRC/cggrqf.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/cggrqf.f b/lapack-netlib/SRC/cggrqf.f index b43febc1f..5227100da 100644 --- a/lapack-netlib/SRC/cggrqf.f +++ b/lapack-netlib/SRC/cggrqf.f @@ -275,7 +275,7 @@ * RQ factorization of M-by-N matrix A: A = R*Q * CALL CGERQF( M, N, A, LDA, TAUA, WORK, LWORK, INFO ) - LOPT = REAL( WORK( 1 ) ) + LOPT = INT( WORK( 1 ) ) * * Update B := B*Q**H * From 63014e99ae33751da236a7f2bf90af8113af89ec Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Nov 2022 22:31:33 +0100 Subject: [PATCH 113/154] Cast work array sizes to integer (Reference-LAPACK PR 684) --- lapack-netlib/SRC/cgelss.f | 26 +++++++++++++------------- lapack-netlib/SRC/sgelss.f | 24 ++++++++++++------------ lapack-netlib/SRC/zgelss.f | 26 +++++++++++++------------- 3 files changed, 38 insertions(+), 38 deletions(-) diff --git a/lapack-netlib/SRC/cgelss.f b/lapack-netlib/SRC/cgelss.f index 04defbb2e..da6b9092f 100644 --- a/lapack-netlib/SRC/cgelss.f +++ b/lapack-netlib/SRC/cgelss.f @@ -266,11 +266,11 @@ * * Compute space needed for CGEQRF CALL CGEQRF( M, N, A, LDA, DUM(1), DUM(1), -1, INFO ) - LWORK_CGEQRF = REAL( DUM(1) ) + LWORK_CGEQRF = INT( DUM(1) ) * Compute space needed for CUNMQR CALL CUNMQR( 'L', 'C', M, NRHS, N, A, LDA, DUM(1), B, $ LDB, DUM(1), -1, INFO ) - LWORK_CUNMQR = REAL( DUM(1) ) + LWORK_CUNMQR = INT( DUM(1) ) MM = N MAXWRK = MAX( MAXWRK, N + N*ILAENV( 1, 'CGEQRF', ' ', M, $ N, -1, -1 ) ) @@ -284,15 +284,15 @@ * Compute space needed for CGEBRD CALL CGEBRD( MM, N, A, LDA, S, S, DUM(1), DUM(1), DUM(1), $ -1, INFO ) - LWORK_CGEBRD = REAL( DUM(1) ) 
+ LWORK_CGEBRD = INT( DUM(1) ) * Compute space needed for CUNMBR CALL CUNMBR( 'Q', 'L', 'C', MM, NRHS, N, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_CUNMBR = REAL( DUM(1) ) + LWORK_CUNMBR = INT( DUM(1) ) * Compute space needed for CUNGBR CALL CUNGBR( 'P', N, N, N, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_CUNGBR = REAL( DUM(1) ) + LWORK_CUNGBR = INT( DUM(1) ) * Compute total workspace needed MAXWRK = MAX( MAXWRK, 2*N + LWORK_CGEBRD ) MAXWRK = MAX( MAXWRK, 2*N + LWORK_CUNMBR ) @@ -310,23 +310,23 @@ * Compute space needed for CGELQF CALL CGELQF( M, N, A, LDA, DUM(1), DUM(1), $ -1, INFO ) - LWORK_CGELQF = REAL( DUM(1) ) + LWORK_CGELQF = INT( DUM(1) ) * Compute space needed for CGEBRD CALL CGEBRD( M, M, A, LDA, S, S, DUM(1), DUM(1), $ DUM(1), -1, INFO ) - LWORK_CGEBRD = REAL( DUM(1) ) + LWORK_CGEBRD = INT( DUM(1) ) * Compute space needed for CUNMBR CALL CUNMBR( 'Q', 'L', 'C', M, NRHS, N, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_CUNMBR = REAL( DUM(1) ) + LWORK_CUNMBR = INT( DUM(1) ) * Compute space needed for CUNGBR CALL CUNGBR( 'P', M, M, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_CUNGBR = REAL( DUM(1) ) + LWORK_CUNGBR = INT( DUM(1) ) * Compute space needed for CUNMLQ CALL CUNMLQ( 'L', 'C', N, NRHS, M, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_CUNMLQ = REAL( DUM(1) ) + LWORK_CUNMLQ = INT( DUM(1) ) * Compute total workspace needed MAXWRK = M + LWORK_CGELQF MAXWRK = MAX( MAXWRK, 3*M + M*M + LWORK_CGEBRD ) @@ -345,15 +345,15 @@ * Compute space needed for CGEBRD CALL CGEBRD( M, N, A, LDA, S, S, DUM(1), DUM(1), $ DUM(1), -1, INFO ) - LWORK_CGEBRD = REAL( DUM(1) ) + LWORK_CGEBRD = INT( DUM(1) ) * Compute space needed for CUNMBR CALL CUNMBR( 'Q', 'L', 'C', M, NRHS, M, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_CUNMBR = REAL( DUM(1) ) + LWORK_CUNMBR = INT( DUM(1) ) * Compute space needed for CUNGBR CALL CUNGBR( 'P', M, N, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_CUNGBR = REAL( DUM(1) ) + LWORK_CUNGBR = INT( DUM(1) ) MAXWRK = 2*M + LWORK_CGEBRD MAXWRK = MAX( MAXWRK, 2*M + LWORK_CUNMBR ) MAXWRK = MAX( MAXWRK, 2*M + LWORK_CUNGBR ) diff --git a/lapack-netlib/SRC/sgelss.f b/lapack-netlib/SRC/sgelss.f index be9e2ea11..9aed4329f 100644 --- a/lapack-netlib/SRC/sgelss.f +++ b/lapack-netlib/SRC/sgelss.f @@ -253,11 +253,11 @@ * * Compute space needed for SGEQRF CALL SGEQRF( M, N, A, LDA, DUM(1), DUM(1), -1, INFO ) - LWORK_SGEQRF=DUM(1) + LWORK_SGEQRF = INT( DUM(1) ) * Compute space needed for SORMQR CALL SORMQR( 'L', 'T', M, NRHS, N, A, LDA, DUM(1), B, $ LDB, DUM(1), -1, INFO ) - LWORK_SORMQR=DUM(1) + LWORK_SORMQR = INT( DUM(1) ) MM = N MAXWRK = MAX( MAXWRK, N + LWORK_SGEQRF ) MAXWRK = MAX( MAXWRK, N + LWORK_SORMQR ) @@ -272,15 +272,15 @@ * Compute space needed for SGEBRD CALL SGEBRD( MM, N, A, LDA, S, DUM(1), DUM(1), $ DUM(1), DUM(1), -1, INFO ) - LWORK_SGEBRD=DUM(1) + LWORK_SGEBRD = INT( DUM(1) ) * Compute space needed for SORMBR CALL SORMBR( 'Q', 'L', 'T', MM, NRHS, N, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_SORMBR=DUM(1) + LWORK_SORMBR = INT( DUM(1) ) * Compute space needed for SORGBR CALL SORGBR( 'P', N, N, N, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_SORGBR=DUM(1) + LWORK_SORGBR = INT( DUM(1) ) * Compute total workspace needed MAXWRK = MAX( MAXWRK, 3*N + LWORK_SGEBRD ) MAXWRK = MAX( MAXWRK, 3*N + LWORK_SORMBR ) @@ -304,19 +304,19 @@ * Compute space needed for SGEBRD CALL SGEBRD( M, M, A, LDA, S, DUM(1), DUM(1), $ DUM(1), DUM(1), -1, INFO ) - LWORK_SGEBRD=DUM(1) + LWORK_SGEBRD = INT( DUM(1) ) * Compute space needed for SORMBR CALL 
SORMBR( 'Q', 'L', 'T', M, NRHS, N, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_SORMBR=DUM(1) + LWORK_SORMBR = INT( DUM(1) ) * Compute space needed for SORGBR CALL SORGBR( 'P', M, M, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_SORGBR=DUM(1) + LWORK_SORGBR = INT( DUM(1) ) * Compute space needed for SORMLQ CALL SORMLQ( 'L', 'T', N, NRHS, M, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_SORMLQ=DUM(1) + LWORK_SORMLQ = INT( DUM(1) ) * Compute total workspace needed MAXWRK = M + M*ILAENV( 1, 'SGELQF', ' ', M, N, -1, $ -1 ) @@ -337,15 +337,15 @@ * Compute space needed for SGEBRD CALL SGEBRD( M, N, A, LDA, S, DUM(1), DUM(1), $ DUM(1), DUM(1), -1, INFO ) - LWORK_SGEBRD=DUM(1) + LWORK_SGEBRD = INT( DUM(1) ) * Compute space needed for SORMBR CALL SORMBR( 'Q', 'L', 'T', M, NRHS, M, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_SORMBR=DUM(1) + LWORK_SORMBR = INT( DUM(1) ) * Compute space needed for SORGBR CALL SORGBR( 'P', M, N, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_SORGBR=DUM(1) + LWORK_SORGBR = INT( DUM(1) ) MAXWRK = 3*M + LWORK_SGEBRD MAXWRK = MAX( MAXWRK, 3*M + LWORK_SORMBR ) MAXWRK = MAX( MAXWRK, 3*M + LWORK_SORGBR ) diff --git a/lapack-netlib/SRC/zgelss.f b/lapack-netlib/SRC/zgelss.f index e4aba6497..be53ba95b 100644 --- a/lapack-netlib/SRC/zgelss.f +++ b/lapack-netlib/SRC/zgelss.f @@ -266,11 +266,11 @@ * * Compute space needed for ZGEQRF CALL ZGEQRF( M, N, A, LDA, DUM(1), DUM(1), -1, INFO ) - LWORK_ZGEQRF = DBLE( DUM(1) ) + LWORK_ZGEQRF = INT( DUM(1) ) * Compute space needed for ZUNMQR CALL ZUNMQR( 'L', 'C', M, NRHS, N, A, LDA, DUM(1), B, $ LDB, DUM(1), -1, INFO ) - LWORK_ZUNMQR = DBLE( DUM(1) ) + LWORK_ZUNMQR = INT( DUM(1) ) MM = N MAXWRK = MAX( MAXWRK, N + N*ILAENV( 1, 'ZGEQRF', ' ', M, $ N, -1, -1 ) ) @@ -284,15 +284,15 @@ * Compute space needed for ZGEBRD CALL ZGEBRD( MM, N, A, LDA, S, S, DUM(1), DUM(1), DUM(1), $ -1, INFO ) - LWORK_ZGEBRD = DBLE( DUM(1) ) + LWORK_ZGEBRD = INT( DUM(1) ) * Compute space needed for ZUNMBR CALL ZUNMBR( 'Q', 'L', 'C', MM, NRHS, N, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_ZUNMBR = DBLE( DUM(1) ) + LWORK_ZUNMBR = INT( DUM(1) ) * Compute space needed for ZUNGBR CALL ZUNGBR( 'P', N, N, N, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_ZUNGBR = DBLE( DUM(1) ) + LWORK_ZUNGBR = INT( DUM(1) ) * Compute total workspace needed MAXWRK = MAX( MAXWRK, 2*N + LWORK_ZGEBRD ) MAXWRK = MAX( MAXWRK, 2*N + LWORK_ZUNMBR ) @@ -310,23 +310,23 @@ * Compute space needed for ZGELQF CALL ZGELQF( M, N, A, LDA, DUM(1), DUM(1), $ -1, INFO ) - LWORK_ZGELQF = DBLE( DUM(1) ) + LWORK_ZGELQF = INT( DUM(1) ) * Compute space needed for ZGEBRD CALL ZGEBRD( M, M, A, LDA, S, S, DUM(1), DUM(1), $ DUM(1), -1, INFO ) - LWORK_ZGEBRD = DBLE( DUM(1) ) + LWORK_ZGEBRD = INT( DUM(1) ) * Compute space needed for ZUNMBR CALL ZUNMBR( 'Q', 'L', 'C', M, NRHS, N, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_ZUNMBR = DBLE( DUM(1) ) + LWORK_ZUNMBR = INT( DUM(1) ) * Compute space needed for ZUNGBR CALL ZUNGBR( 'P', M, M, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_ZUNGBR = DBLE( DUM(1) ) + LWORK_ZUNGBR = INT( DUM(1) ) * Compute space needed for ZUNMLQ CALL ZUNMLQ( 'L', 'C', N, NRHS, M, A, LDA, DUM(1), $ B, LDB, DUM(1), -1, INFO ) - LWORK_ZUNMLQ = DBLE( DUM(1) ) + LWORK_ZUNMLQ = INT( DUM(1) ) * Compute total workspace needed MAXWRK = M + LWORK_ZGELQF MAXWRK = MAX( MAXWRK, 3*M + M*M + LWORK_ZGEBRD ) @@ -345,15 +345,15 @@ * Compute space needed for ZGEBRD CALL ZGEBRD( M, N, A, LDA, S, S, DUM(1), DUM(1), $ DUM(1), -1, INFO ) - LWORK_ZGEBRD = DBLE( DUM(1) ) + 
LWORK_ZGEBRD = INT( DUM(1) ) * Compute space needed for ZUNMBR CALL ZUNMBR( 'Q', 'L', 'C', M, NRHS, M, A, LDA, $ DUM(1), B, LDB, DUM(1), -1, INFO ) - LWORK_ZUNMBR = DBLE( DUM(1) ) + LWORK_ZUNMBR = INT( DUM(1) ) * Compute space needed for ZUNGBR CALL ZUNGBR( 'P', M, N, M, A, LDA, DUM(1), $ DUM(1), -1, INFO ) - LWORK_ZUNGBR = DBLE( DUM(1) ) + LWORK_ZUNGBR = INT( DUM(1) ) MAXWRK = 2*M + LWORK_ZGEBRD MAXWRK = MAX( MAXWRK, 2*M + LWORK_ZUNMBR ) MAXWRK = MAX( MAXWRK, 2*M + LWORK_ZUNGBR ) From d0afbd8d29f3405f2a670bbc72c264d4d54d5b24 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Nov 2022 22:34:42 +0100 Subject: [PATCH 114/154] Add new routines for ?GELST similar to ?GELS (Reference-LAPACK PR739) --- lapack-netlib/SRC/cgelst.f | 533 +++++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/dgelst.f | 531 ++++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/sgelst.f | 531 ++++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/zgelst.f | 533 +++++++++++++++++++++++++++++++++++++ 4 files changed, 2128 insertions(+) create mode 100644 lapack-netlib/SRC/cgelst.f create mode 100644 lapack-netlib/SRC/dgelst.f create mode 100644 lapack-netlib/SRC/sgelst.f create mode 100644 lapack-netlib/SRC/zgelst.f diff --git a/lapack-netlib/SRC/cgelst.f b/lapack-netlib/SRC/cgelst.f new file mode 100644 index 000000000..7d8e44ddf --- /dev/null +++ b/lapack-netlib/SRC/cgelst.f @@ -0,0 +1,533 @@ +*> \brief CGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factorization with compact WY representation of Q. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CGELST + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, +* INFO ) +* +* .. Scalar Arguments .. +* CHARACTER TRANS +* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. +* COMPLEX A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CGELST solves overdetermined or underdetermined real linear systems +*> involving an M-by-N matrix A, or its conjugate-transpose, using a QR +*> or LQ factorization of A with compact WY representation of Q. +*> It is assumed that A has full rank. +*> +*> The following options are provided: +*> +*> 1. If TRANS = 'N' and m >= n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A*X ||. +*> +*> 2. If TRANS = 'N' and m < n: find the minimum norm solution of +*> an underdetermined system A * X = B. +*> +*> 3. If TRANS = 'C' and m >= n: find the minimum norm solution of +*> an underdetermined system A**T * X = B. +*> +*> 4. If TRANS = 'C' and m < n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A**T * X ||. +*> +*> Several right hand side vectors b and solution vectors x can be +*> handled in a single call; they are stored as the columns of the +*> M-by-NRHS right hand side matrix B and the N-by-NRHS solution +*> matrix X. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': the linear system involves A; +*> = 'C': the linear system involves A**H. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. 
+*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrices B and X. NRHS >=0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> On entry, the M-by-N matrix A. +*> On exit, +*> if M >= N, A is overwritten by details of its QR +*> factorization as returned by CGEQRT; +*> if M < N, A is overwritten by details of its LQ +*> factorization as returned by CGELQT. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX array, dimension (LDB,NRHS) +*> On entry, the matrix B of right hand side vectors, stored +*> columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS +*> if TRANS = 'C'. +*> On exit, if INFO = 0, B is overwritten by the solution +*> vectors, stored columnwise: +*> if TRANS = 'N' and m >= n, rows 1 to n of B contain the least +*> squares solution vectors; the residual sum of squares for the +*> solution in each column is given by the sum of squares of +*> modulus of elements N+1 to M in that column; +*> if TRANS = 'N' and m < n, rows 1 to N of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'C' and m >= n, rows 1 to M of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'C' and m < n, rows 1 to M of B contain the +*> least squares solution vectors; the residual sum of squares +*> for the solution in each column is given by the sum of +*> squares of the modulus of elements M+1 to N in that column. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= MAX(1,M,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= max( 1, MN + max( MN, NRHS ) ). +*> For optimal performance, +*> LWORK >= max( 1, (MN + max( MN, NRHS ))*NB ). +*> where MN = min(M,N) and NB is the optimum block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, the i-th diagonal element of the +*> triangular factor of A is zero, so that A does not have +*> full rank; the least squares solution could not be +*> computed. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
+* +*> \ingroup complexGEsolve +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2022, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> \endverbatim +* +* ===================================================================== + SUBROUTINE CGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, + $ INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER TRANS + INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) + COMPLEX CZERO + PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, TPSD + INTEGER BROW, I, IASCL, IBSCL, J, LWOPT, MN, MNNRHS, + $ NB, NBMIN, SCLLEN + REAL ANRM, BIGNUM, BNRM, SMLNUM +* .. +* .. Local Arrays .. + REAL RWORK( 1 ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, CLANGE + EXTERNAL LSAME, ILAENV, SLAMCH, CLANGE +* .. +* .. External Subroutines .. + EXTERNAL CGELQT, CGEQRT, CGEMLQT, CGEMQRT, SLABAD, + $ CLASCL, CLASET, CTRTRS, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments. +* + INFO = 0 + MN = MIN( M, N ) + LQUERY = ( LWORK.EQ.-1 ) + IF( .NOT.( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'C' ) ) ) THEN + INFO = -1 + ELSE IF( M.LT.0 ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDB.LT.MAX( 1, M, N ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.MAX( 1, MN+MAX( MN, NRHS ) ) .AND. .NOT.LQUERY ) + $ THEN + INFO = -10 + END IF +* +* Figure out optimal block size and optimal workspace size +* + IF( INFO.EQ.0 .OR. INFO.EQ.-10 ) THEN +* + TPSD = .TRUE. + IF( LSAME( TRANS, 'N' ) ) + $ TPSD = .FALSE. +* + NB = ILAENV( 1, 'CGELST', ' ', M, N, -1, -1 ) +* + MNNRHS = MAX( MN, NRHS ) + LWOPT = MAX( 1, (MN+MNNRHS)*NB ) + WORK( 1 ) = REAL( LWOPT ) +* + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGELST ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + CALL CLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, LDB ) + WORK( 1 ) = REAL( LWOPT ) + RETURN + END IF +* +* *GEQRT and *GELQT routines cannot accept NB larger than min(M,N) +* + IF( NB.GT.MN ) NB = MN +* +* Determine the block size from the supplied LWORK +* ( at this stage we know that LWORK >= (minimum required workspace, +* but it may be less than optimal) +* + NB = MIN( NB, LWORK/( MN + MNNRHS ) ) +* +* The minimum value of NB, when blocked code is used +* + NBMIN = MAX( 2, ILAENV( 2, 'CGELST', ' ', M, N, -1, -1 ) ) +* + IF( NB.LT.NBMIN ) THEN + NB = 1 + END IF +* +* Get machine parameters +* + SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) + BIGNUM = ONE / SMLNUM + CALL SLABAD( SMLNUM, BIGNUM ) +* +* Scale A, B if max element outside range [SMLNUM,BIGNUM] +* + ANRM = CLANGE( 'M', M, N, A, LDA, RWORK ) + IASCL = 0 + IF( ANRM.GT.ZERO .AND. 
ANRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL CLASCL( 'G', 0, 0, ANRM, SMLNUM, M, N, A, LDA, INFO ) + IASCL = 1 + ELSE IF( ANRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL CLASCL( 'G', 0, 0, ANRM, BIGNUM, M, N, A, LDA, INFO ) + IASCL = 2 + ELSE IF( ANRM.EQ.ZERO ) THEN +* +* Matrix all zero. Return zero solution. +* + CALL CLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, LDB ) + WORK( 1 ) = REAL( LWOPT ) + RETURN + END IF +* + BROW = M + IF( TPSD ) + $ BROW = N + BNRM = CLANGE( 'M', BROW, NRHS, B, LDB, RWORK ) + IBSCL = 0 + IF( BNRM.GT.ZERO .AND. BNRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL CLASCL( 'G', 0, 0, BNRM, SMLNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 1 + ELSE IF( BNRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL CLASCL( 'G', 0, 0, BNRM, BIGNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 2 + END IF +* + IF( M.GE.N ) THEN +* +* M > N: +* Compute the blocked QR factorization of A, +* using the compact WY representation of Q, +* workspace at least N, optimally N*NB. +* + CALL CGEQRT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M > N, A is not transposed: +* Overdetermined system of equations, +* least-squares problem, min || A * X - B ||. +* +* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL CGEMQRT( 'Left', 'Conjugate transpose', M, NRHS, N, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* +* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) +* + CALL CTRTRS( 'Upper', 'No transpose', 'Non-unit', N, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = N +* + ELSE +* +* M > N, A is transposed: +* Underdetermined system of equations, +* minimum norm solution of A**T * X = B. +* +* Compute B := inv(R**T) * B in two row blocks of B. +* +* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) +* + CALL CTRTRS( 'Upper', 'Conjugate transpose', 'Non-unit', + $ N, NRHS, A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the N-th row in B: +* B(N+1:M,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = N + 1, M + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL CGEMQRT( 'Left', 'No transpose', M, NRHS, N, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = M +* + END IF +* + ELSE +* +* M < N: +* Compute the blocked LQ factorization of A, +* using the compact WY representation of Q, +* workspace at least M, optimally M*NB. +* + CALL CGELQT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M < N, A is not transposed: +* Underdetermined system of equations, +* minimum norm solution of A * X = B. +* +* Compute B := inv(L) * B in two row blocks of B. +* +* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) +* + CALL CTRTRS( 'Lower', 'No transpose', 'Non-unit', M, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the M-th row in B: +* B(M+1:N,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = M + 1, N + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. 
+* + CALL CGEMLQT( 'Left', 'Conjugate transpose', N, NRHS, M, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = N +* + ELSE +* +* M < N, A is transposed: +* Overdetermined system of equations, +* least-squares problem, min || A**T * X - B ||. +* +* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL CGEMLQT( 'Left', 'No transpose', N, NRHS, M, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1), INFO ) +* +* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) +* + CALL CTRTRS( 'Lower', 'Conjugate transpose', 'Non-unit', + $ M, NRHS, A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = M +* + END IF +* + END IF +* +* Undo scaling +* + IF( IASCL.EQ.1 ) THEN + CALL CLASCL( 'G', 0, 0, ANRM, SMLNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IASCL.EQ.2 ) THEN + CALL CLASCL( 'G', 0, 0, ANRM, BIGNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF + IF( IBSCL.EQ.1 ) THEN + CALL CLASCL( 'G', 0, 0, SMLNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IBSCL.EQ.2 ) THEN + CALL CLASCL( 'G', 0, 0, BIGNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF +* + WORK( 1 ) = REAL( LWOPT ) +* + RETURN +* +* End of CGELST +* + END diff --git a/lapack-netlib/SRC/dgelst.f b/lapack-netlib/SRC/dgelst.f new file mode 100644 index 000000000..ca0e04a9b --- /dev/null +++ b/lapack-netlib/SRC/dgelst.f @@ -0,0 +1,531 @@ +*> \brief DGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factorization with compact WY representation of Q. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DGELST + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, +* INFO ) +* +* .. Scalar Arguments .. +* CHARACTER TRANS +* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DGELST solves overdetermined or underdetermined real linear systems +*> involving an M-by-N matrix A, or its transpose, using a QR or LQ +*> factorization of A with compact WY representation of Q. +*> It is assumed that A has full rank. +*> +*> The following options are provided: +*> +*> 1. If TRANS = 'N' and m >= n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A*X ||. +*> +*> 2. If TRANS = 'N' and m < n: find the minimum norm solution of +*> an underdetermined system A * X = B. +*> +*> 3. If TRANS = 'T' and m >= n: find the minimum norm solution of +*> an underdetermined system A**T * X = B. +*> +*> 4. If TRANS = 'T' and m < n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A**T * X ||. +*> +*> Several right hand side vectors b and solution vectors x can be +*> handled in a single call; they are stored as the columns of the +*> M-by-NRHS right hand side matrix B and the N-by-NRHS solution +*> matrix X. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': the linear system involves A; +*> = 'T': the linear system involves A**T. 
+*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrices B and X. NRHS >=0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> On entry, the M-by-N matrix A. +*> On exit, +*> if M >= N, A is overwritten by details of its QR +*> factorization as returned by DGEQRT; +*> if M < N, A is overwritten by details of its LQ +*> factorization as returned by DGELQT. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (LDB,NRHS) +*> On entry, the matrix B of right hand side vectors, stored +*> columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS +*> if TRANS = 'T'. +*> On exit, if INFO = 0, B is overwritten by the solution +*> vectors, stored columnwise: +*> if TRANS = 'N' and m >= n, rows 1 to n of B contain the least +*> squares solution vectors; the residual sum of squares for the +*> solution in each column is given by the sum of squares of +*> elements N+1 to M in that column; +*> if TRANS = 'N' and m < n, rows 1 to N of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'T' and m >= n, rows 1 to M of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'T' and m < n, rows 1 to M of B contain the +*> least squares solution vectors; the residual sum of squares +*> for the solution in each column is given by the sum of +*> squares of elements M+1 to N in that column. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= MAX(1,M,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= max( 1, MN + max( MN, NRHS ) ). +*> For optimal performance, +*> LWORK >= max( 1, (MN + max( MN, NRHS ))*NB ). +*> where MN = min(M,N) and NB is the optimum block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, the i-th diagonal element of the +*> triangular factor of A is zero, so that A does not have +*> full rank; the least squares solution could not be +*> computed. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
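A minimal calling sketch for the interface documented above: query the optimal workspace with LWORK = -1, then call DGELST again to solve. Everything apart from the DGELST calls themselves (the 3-by-2 data, the fixed LWMAX buffer size, and the variable names) is an illustrative assumption, not something taken from this patch.

      PROGRAM DGELST_EXAMPLE
*     Illustrative sketch only: solve the 3-by-2 least squares problem
*     min || b - A*x || with DGELST; data and names are hypothetical.
      INTEGER            M, N, NRHS, LDA, LDB, LWMAX, LWORK, INFO
      PARAMETER          ( M = 3, N = 2, NRHS = 1, LDA = M, LDB = M,
     $                     LWMAX = 100 )
      DOUBLE PRECISION   A( LDA, N ), B( LDB, NRHS ), WORK( LWMAX )
      DATA               A / 1.0D0, 1.0D0, 1.0D0,
     $                       1.0D0, 2.0D0, 3.0D0 /
      DATA               B / 6.0D0, 0.0D0, 0.0D0 /
*     Workspace query: the optimal LWORK is returned in WORK( 1 )
      CALL DGELST( 'N', M, N, NRHS, A, LDA, B, LDB, WORK, -1, INFO )
      LWORK = MIN( LWMAX, INT( WORK( 1 ) ) )
*     Solve; on exit rows 1 to N of B hold the least squares solution
      CALL DGELST( 'N', M, N, NRHS, A, LDA, B, LDB, WORK, LWORK,
     $             INFO )
      IF( INFO.NE.0 )
     $   WRITE( *, * ) 'DGELST returned INFO = ', INFO
      END

On successful exit rows 1 to N of B contain the least squares solution, and the sum of squares of elements N+1 to M in each column gives the residual sum of squares, as described for the B argument above.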
+* +*> \ingroup doubleGEsolve +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2022, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> \endverbatim +* +* ===================================================================== + SUBROUTINE DGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, + $ INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER TRANS + INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, TPSD + INTEGER BROW, I, IASCL, IBSCL, J, LWOPT, MN, MNNRHS, + $ NB, NBMIN, SCLLEN + DOUBLE PRECISION ANRM, BIGNUM, BNRM, SMLNUM +* .. +* .. Local Arrays .. + DOUBLE PRECISION RWORK( 1 ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, DLANGE + EXTERNAL LSAME, ILAENV, DLAMCH, DLANGE +* .. +* .. External Subroutines .. + EXTERNAL DGELQT, DGEQRT, DGEMLQT, DGEMQRT, DLABAD, + $ DLASCL, DLASET, DTRTRS, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments. +* + INFO = 0 + MN = MIN( M, N ) + LQUERY = ( LWORK.EQ.-1 ) + IF( .NOT.( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) ) ) THEN + INFO = -1 + ELSE IF( M.LT.0 ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDB.LT.MAX( 1, M, N ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.MAX( 1, MN+MAX( MN, NRHS ) ) .AND. .NOT.LQUERY ) + $ THEN + INFO = -10 + END IF +* +* Figure out optimal block size and optimal workspace size +* + IF( INFO.EQ.0 .OR. INFO.EQ.-10 ) THEN +* + TPSD = .TRUE. + IF( LSAME( TRANS, 'N' ) ) + $ TPSD = .FALSE. +* + NB = ILAENV( 1, 'DGELST', ' ', M, N, -1, -1 ) +* + MNNRHS = MAX( MN, NRHS ) + LWOPT = MAX( 1, (MN+MNNRHS)*NB ) + WORK( 1 ) = DBLE( LWOPT ) +* + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGELST ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + CALL DLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) + WORK( 1 ) = DBLE( LWOPT ) + RETURN + END IF +* +* *GEQRT and *GELQT routines cannot accept NB larger than min(M,N) +* + IF( NB.GT.MN ) NB = MN +* +* Determine the block size from the supplied LWORK +* ( at this stage we know that LWORK >= (minimum required workspace, +* but it may be less than optimal) +* + NB = MIN( NB, LWORK/( MN + MNNRHS ) ) +* +* The minimum value of NB, when blocked code is used +* + NBMIN = MAX( 2, ILAENV( 2, 'DGELST', ' ', M, N, -1, -1 ) ) +* + IF( NB.LT.NBMIN ) THEN + NB = 1 + END IF +* +* Get machine parameters +* + SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) + BIGNUM = ONE / SMLNUM + CALL DLABAD( SMLNUM, BIGNUM ) +* +* Scale A, B if max element outside range [SMLNUM,BIGNUM] +* + ANRM = DLANGE( 'M', M, N, A, LDA, RWORK ) + IASCL = 0 + IF( ANRM.GT.ZERO .AND. 
ANRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL DLASCL( 'G', 0, 0, ANRM, SMLNUM, M, N, A, LDA, INFO ) + IASCL = 1 + ELSE IF( ANRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL DLASCL( 'G', 0, 0, ANRM, BIGNUM, M, N, A, LDA, INFO ) + IASCL = 2 + ELSE IF( ANRM.EQ.ZERO ) THEN +* +* Matrix all zero. Return zero solution. +* + CALL DLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) + WORK( 1 ) = DBLE( LWOPT ) + RETURN + END IF +* + BROW = M + IF( TPSD ) + $ BROW = N + BNRM = DLANGE( 'M', BROW, NRHS, B, LDB, RWORK ) + IBSCL = 0 + IF( BNRM.GT.ZERO .AND. BNRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL DLASCL( 'G', 0, 0, BNRM, SMLNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 1 + ELSE IF( BNRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL DLASCL( 'G', 0, 0, BNRM, BIGNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 2 + END IF +* + IF( M.GE.N ) THEN +* +* M > N: +* Compute the blocked QR factorization of A, +* using the compact WY representation of Q, +* workspace at least N, optimally N*NB. +* + CALL DGEQRT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M > N, A is not transposed: +* Overdetermined system of equations, +* least-squares problem, min || A * X - B ||. +* +* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL DGEMQRT( 'Left', 'Transpose', M, NRHS, N, NB, A, LDA, + $ WORK( 1 ), NB, B, LDB, WORK( MN*NB+1 ), + $ INFO ) +* +* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) +* + CALL DTRTRS( 'Upper', 'No transpose', 'Non-unit', N, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = N +* + ELSE +* +* M > N, A is transposed: +* Underdetermined system of equations, +* minimum norm solution of A**T * X = B. +* +* Compute B := inv(R**T) * B in two row blocks of B. +* +* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) +* + CALL DTRTRS( 'Upper', 'Transpose', 'Non-unit', N, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the N-th row in B: +* B(N+1:M,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = N + 1, M + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL DGEMQRT( 'Left', 'No transpose', M, NRHS, N, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = M +* + END IF +* + ELSE +* +* M < N: +* Compute the blocked LQ factorization of A, +* using the compact WY representation of Q, +* workspace at least M, optimally M*NB. +* + CALL DGELQT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M < N, A is not transposed: +* Underdetermined system of equations, +* minimum norm solution of A * X = B. +* +* Compute B := inv(L) * B in two row blocks of B. +* +* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) +* + CALL DTRTRS( 'Lower', 'No transpose', 'Non-unit', M, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the M-th row in B: +* B(M+1:N,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = M + 1, N + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. 
+* + CALL DGEMLQT( 'Left', 'Transpose', N, NRHS, M, NB, A, LDA, + $ WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = N +* + ELSE +* +* M < N, A is transposed: +* Overdetermined system of equations, +* least-squares problem, min || A**T * X - B ||. +* +* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL DGEMLQT( 'Left', 'No transpose', N, NRHS, M, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1), INFO ) +* +* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) +* + CALL DTRTRS( 'Lower', 'Transpose', 'Non-unit', M, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = M +* + END IF +* + END IF +* +* Undo scaling +* + IF( IASCL.EQ.1 ) THEN + CALL DLASCL( 'G', 0, 0, ANRM, SMLNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IASCL.EQ.2 ) THEN + CALL DLASCL( 'G', 0, 0, ANRM, BIGNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF + IF( IBSCL.EQ.1 ) THEN + CALL DLASCL( 'G', 0, 0, SMLNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IBSCL.EQ.2 ) THEN + CALL DLASCL( 'G', 0, 0, BIGNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF +* + WORK( 1 ) = DBLE( LWOPT ) +* + RETURN +* +* End of DGELST +* + END diff --git a/lapack-netlib/SRC/sgelst.f b/lapack-netlib/SRC/sgelst.f new file mode 100644 index 000000000..5377bc720 --- /dev/null +++ b/lapack-netlib/SRC/sgelst.f @@ -0,0 +1,531 @@ +*> \brief SGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factorization with compact WY representation of Q. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SGELST + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, +* INFO ) +* +* .. Scalar Arguments .. +* CHARACTER TRANS +* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SGELST solves overdetermined or underdetermined real linear systems +*> involving an M-by-N matrix A, or its transpose, using a QR or LQ +*> factorization of A with compact WY representation of Q. +*> It is assumed that A has full rank. +*> +*> The following options are provided: +*> +*> 1. If TRANS = 'N' and m >= n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A*X ||. +*> +*> 2. If TRANS = 'N' and m < n: find the minimum norm solution of +*> an underdetermined system A * X = B. +*> +*> 3. If TRANS = 'T' and m >= n: find the minimum norm solution of +*> an underdetermined system A**T * X = B. +*> +*> 4. If TRANS = 'T' and m < n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A**T * X ||. +*> +*> Several right hand side vectors b and solution vectors x can be +*> handled in a single call; they are stored as the columns of the +*> M-by-NRHS right hand side matrix B and the N-by-NRHS solution +*> matrix X. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': the linear system involves A; +*> = 'T': the linear system involves A**T. 
+*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrices B and X. NRHS >=0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> On entry, the M-by-N matrix A. +*> On exit, +*> if M >= N, A is overwritten by details of its QR +*> factorization as returned by SGEQRT; +*> if M < N, A is overwritten by details of its LQ +*> factorization as returned by SGELQT. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is REAL array, dimension (LDB,NRHS) +*> On entry, the matrix B of right hand side vectors, stored +*> columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS +*> if TRANS = 'T'. +*> On exit, if INFO = 0, B is overwritten by the solution +*> vectors, stored columnwise: +*> if TRANS = 'N' and m >= n, rows 1 to n of B contain the least +*> squares solution vectors; the residual sum of squares for the +*> solution in each column is given by the sum of squares of +*> elements N+1 to M in that column; +*> if TRANS = 'N' and m < n, rows 1 to N of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'T' and m >= n, rows 1 to M of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'T' and m < n, rows 1 to M of B contain the +*> least squares solution vectors; the residual sum of squares +*> for the solution in each column is given by the sum of +*> squares of elements M+1 to N in that column. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= MAX(1,M,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= max( 1, MN + max( MN, NRHS ) ). +*> For optimal performance, +*> LWORK >= max( 1, (MN + max( MN, NRHS ))*NB ). +*> where MN = min(M,N) and NB is the optimum block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, the i-th diagonal element of the +*> triangular factor of A is zero, so that A does not have +*> full rank; the least squares solution could not be +*> computed. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
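As a concrete reading of the LWORK bounds documented above (the block size NB = 32 is only an assumed value; ILAENV supplies the real one): for M = 1000, N = 100 and NRHS = 1 we have MN = min(M,N) = 100, so the minimum workspace is max(1, MN + max(MN, NRHS)) = 200, while the optimal size becomes (MN + max(MN, NRHS))*NB = 200*32 = 6400.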
+* +*> \ingroup realGEsolve +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2022, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> \endverbatim +* +* ===================================================================== + SUBROUTINE SGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, + $ INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER TRANS + INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. + REAL A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, TPSD + INTEGER BROW, I, IASCL, IBSCL, J, LWOPT, MN, MNNRHS, + $ NB, NBMIN, SCLLEN + REAL ANRM, BIGNUM, BNRM, SMLNUM +* .. +* .. Local Arrays .. + REAL RWORK( 1 ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + REAL SLAMCH, SLANGE + EXTERNAL LSAME, ILAENV, SLAMCH, SLANGE +* .. +* .. External Subroutines .. + EXTERNAL SGELQT, SGEQRT, SGEMLQT, SGEMQRT, SLABAD, + $ SLASCL, SLASET, STRTRS, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments. +* + INFO = 0 + MN = MIN( M, N ) + LQUERY = ( LWORK.EQ.-1 ) + IF( .NOT.( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) ) ) THEN + INFO = -1 + ELSE IF( M.LT.0 ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDB.LT.MAX( 1, M, N ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.MAX( 1, MN+MAX( MN, NRHS ) ) .AND. .NOT.LQUERY ) + $ THEN + INFO = -10 + END IF +* +* Figure out optimal block size and optimal workspace size +* + IF( INFO.EQ.0 .OR. INFO.EQ.-10 ) THEN +* + TPSD = .TRUE. + IF( LSAME( TRANS, 'N' ) ) + $ TPSD = .FALSE. +* + NB = ILAENV( 1, 'SGELST', ' ', M, N, -1, -1 ) +* + MNNRHS = MAX( MN, NRHS ) + LWOPT = MAX( 1, (MN+MNNRHS)*NB ) + WORK( 1 ) = REAL( LWOPT ) +* + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGELST ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + CALL SLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) + WORK( 1 ) = REAL( LWOPT ) + RETURN + END IF +* +* *GEQRT and *GELQT routines cannot accept NB larger than min(M,N) +* + IF( NB.GT.MN ) NB = MN +* +* Determine the block size from the supplied LWORK +* ( at this stage we know that LWORK >= (minimum required workspace, +* but it may be less than optimal) +* + NB = MIN( NB, LWORK/( MN + MNNRHS ) ) +* +* The minimum value of NB, when blocked code is used +* + NBMIN = MAX( 2, ILAENV( 2, 'SGELST', ' ', M, N, -1, -1 ) ) +* + IF( NB.LT.NBMIN ) THEN + NB = 1 + END IF +* +* Get machine parameters +* + SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' ) + BIGNUM = ONE / SMLNUM + CALL SLABAD( SMLNUM, BIGNUM ) +* +* Scale A, B if max element outside range [SMLNUM,BIGNUM] +* + ANRM = SLANGE( 'M', M, N, A, LDA, RWORK ) + IASCL = 0 + IF( ANRM.GT.ZERO .AND. 
ANRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL SLASCL( 'G', 0, 0, ANRM, SMLNUM, M, N, A, LDA, INFO ) + IASCL = 1 + ELSE IF( ANRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL SLASCL( 'G', 0, 0, ANRM, BIGNUM, M, N, A, LDA, INFO ) + IASCL = 2 + ELSE IF( ANRM.EQ.ZERO ) THEN +* +* Matrix all zero. Return zero solution. +* + CALL SLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB ) + WORK( 1 ) = REAL( LWOPT ) + RETURN + END IF +* + BROW = M + IF( TPSD ) + $ BROW = N + BNRM = SLANGE( 'M', BROW, NRHS, B, LDB, RWORK ) + IBSCL = 0 + IF( BNRM.GT.ZERO .AND. BNRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL SLASCL( 'G', 0, 0, BNRM, SMLNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 1 + ELSE IF( BNRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL SLASCL( 'G', 0, 0, BNRM, BIGNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 2 + END IF +* + IF( M.GE.N ) THEN +* +* M > N: +* Compute the blocked QR factorization of A, +* using the compact WY representation of Q, +* workspace at least N, optimally N*NB. +* + CALL SGEQRT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M > N, A is not transposed: +* Overdetermined system of equations, +* least-squares problem, min || A * X - B ||. +* +* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL SGEMQRT( 'Left', 'Transpose', M, NRHS, N, NB, A, LDA, + $ WORK( 1 ), NB, B, LDB, WORK( MN*NB+1 ), + $ INFO ) +* +* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) +* + CALL STRTRS( 'Upper', 'No transpose', 'Non-unit', N, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = N +* + ELSE +* +* M > N, A is transposed: +* Underdetermined system of equations, +* minimum norm solution of A**T * X = B. +* +* Compute B := inv(R**T) * B in two row blocks of B. +* +* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) +* + CALL STRTRS( 'Upper', 'Transpose', 'Non-unit', N, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the N-th row in B: +* B(N+1:M,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = N + 1, M + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL SGEMQRT( 'Left', 'No transpose', M, NRHS, N, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = M +* + END IF +* + ELSE +* +* M < N: +* Compute the blocked LQ factorization of A, +* using the compact WY representation of Q, +* workspace at least M, optimally M*NB. +* + CALL SGELQT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M < N, A is not transposed: +* Underdetermined system of equations, +* minimum norm solution of A * X = B. +* +* Compute B := inv(L) * B in two row blocks of B. +* +* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) +* + CALL STRTRS( 'Lower', 'No transpose', 'Non-unit', M, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the M-th row in B: +* B(M+1:N,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = M + 1, N + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. 
+* + CALL SGEMLQT( 'Left', 'Transpose', N, NRHS, M, NB, A, LDA, + $ WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = N +* + ELSE +* +* M < N, A is transposed: +* Overdetermined system of equations, +* least-squares problem, min || A**T * X - B ||. +* +* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL SGEMLQT( 'Left', 'No transpose', N, NRHS, M, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1), INFO ) +* +* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) +* + CALL STRTRS( 'Lower', 'Transpose', 'Non-unit', M, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = M +* + END IF +* + END IF +* +* Undo scaling +* + IF( IASCL.EQ.1 ) THEN + CALL SLASCL( 'G', 0, 0, ANRM, SMLNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IASCL.EQ.2 ) THEN + CALL SLASCL( 'G', 0, 0, ANRM, BIGNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF + IF( IBSCL.EQ.1 ) THEN + CALL SLASCL( 'G', 0, 0, SMLNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IBSCL.EQ.2 ) THEN + CALL SLASCL( 'G', 0, 0, BIGNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF +* + WORK( 1 ) = REAL( LWOPT ) +* + RETURN +* +* End of SGELST +* + END diff --git a/lapack-netlib/SRC/zgelst.f b/lapack-netlib/SRC/zgelst.f new file mode 100644 index 000000000..4dabdc91e --- /dev/null +++ b/lapack-netlib/SRC/zgelst.f @@ -0,0 +1,533 @@ +*> \brief ZGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factorization with compact WY representation of Q. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZGELST + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, +* INFO ) +* +* .. Scalar Arguments .. +* CHARACTER TRANS +* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZGELST solves overdetermined or underdetermined real linear systems +*> involving an M-by-N matrix A, or its conjugate-transpose, using a QR +*> or LQ factorization of A with compact WY representation of Q. +*> It is assumed that A has full rank. +*> +*> The following options are provided: +*> +*> 1. If TRANS = 'N' and m >= n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A*X ||. +*> +*> 2. If TRANS = 'N' and m < n: find the minimum norm solution of +*> an underdetermined system A * X = B. +*> +*> 3. If TRANS = 'C' and m >= n: find the minimum norm solution of +*> an underdetermined system A**T * X = B. +*> +*> 4. If TRANS = 'C' and m < n: find the least squares solution of +*> an overdetermined system, i.e., solve the least squares problem +*> minimize || B - A**T * X ||. +*> +*> Several right hand side vectors b and solution vectors x can be +*> handled in a single call; they are stored as the columns of the +*> M-by-NRHS right hand side matrix B and the N-by-NRHS solution +*> matrix X. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] TRANS +*> \verbatim +*> TRANS is CHARACTER*1 +*> = 'N': the linear system involves A; +*> = 'C': the linear system involves A**H. 
+*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. N >= 0. +*> \endverbatim +*> +*> \param[in] NRHS +*> \verbatim +*> NRHS is INTEGER +*> The number of right hand sides, i.e., the number of +*> columns of the matrices B and X. NRHS >=0. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> On entry, the M-by-N matrix A. +*> On exit, +*> if M >= N, A is overwritten by details of its QR +*> factorization as returned by ZGEQRT; +*> if M < N, A is overwritten by details of its LQ +*> factorization as returned by ZGELQT. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX*16 array, dimension (LDB,NRHS) +*> On entry, the matrix B of right hand side vectors, stored +*> columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS +*> if TRANS = 'C'. +*> On exit, if INFO = 0, B is overwritten by the solution +*> vectors, stored columnwise: +*> if TRANS = 'N' and m >= n, rows 1 to n of B contain the least +*> squares solution vectors; the residual sum of squares for the +*> solution in each column is given by the sum of squares of +*> modulus of elements N+1 to M in that column; +*> if TRANS = 'N' and m < n, rows 1 to N of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'C' and m >= n, rows 1 to M of B contain the +*> minimum norm solution vectors; +*> if TRANS = 'C' and m < n, rows 1 to M of B contain the +*> least squares solution vectors; the residual sum of squares +*> for the solution in each column is given by the sum of +*> squares of the modulus of elements M+1 to N in that column. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= MAX(1,M,N). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> LWORK is INTEGER +*> The dimension of the array WORK. +*> LWORK >= max( 1, MN + max( MN, NRHS ) ). +*> For optimal performance, +*> LWORK >= max( 1, (MN + max( MN, NRHS ))*NB ). +*> where MN = min(M,N) and NB is the optimum block size. +*> +*> If LWORK = -1, then a workspace query is assumed; the routine +*> only calculates the optimal size of the WORK array, returns +*> this value as the first entry of the WORK array, and no error +*> message related to LWORK is issued by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> > 0: if INFO = i, the i-th diagonal element of the +*> triangular factor of A is zero, so that A does not have +*> full rank; the least squares solution could not be +*> computed. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
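The complex driver follows the same two-call pattern, with WORK declared COMPLEX*16 and TRANS = 'C' selecting the conjugate-transposed system A**H * X = B. A minimal sketch follows; the 3-by-2 data, the LWMAX buffer and the variable names are illustrative assumptions, and only the ZGELST calls themselves follow the interface documented above.

      PROGRAM ZGELST_EXAMPLE
*     Illustrative sketch only: minimum norm solution of A**H * X = B
*     with ZGELST ( TRANS = 'C', M >= N ); data are hypothetical.
      INTEGER            M, N, NRHS, LDA, LDB, LWMAX, LWORK, INFO
      PARAMETER          ( M = 3, N = 2, NRHS = 1, LDA = M, LDB = M,
     $                     LWMAX = 100 )
      COMPLEX*16         A( LDA, N ), B( LDB, NRHS ), WORK( LWMAX )
      DATA               A / (1.0D0, 0.0D0), (0.0D0, 1.0D0),
     $                       (1.0D0, 0.0D0), (0.0D0, 0.0D0),
     $                       (2.0D0, 0.0D0), (1.0D0,-1.0D0) /
      DATA               B / (1.0D0, 0.0D0), (0.0D0, 2.0D0),
     $                       (0.0D0, 0.0D0) /
*     Workspace query: the optimal LWORK is returned in WORK( 1 )
      CALL ZGELST( 'C', M, N, NRHS, A, LDA, B, LDB, WORK, -1, INFO )
      LWORK = MIN( LWMAX, INT( WORK( 1 ) ) )
*     Solve; on exit rows 1 to M of B hold the minimum norm solution
      CALL ZGELST( 'C', M, N, NRHS, A, LDA, B, LDB, WORK, LWORK,
     $             INFO )
      IF( INFO.NE.0 )
     $   WRITE( *, * ) 'ZGELST returned INFO = ', INFO
      END

TRANS = 'C' is chosen here only to exercise the conjugate-transposed path of case 3 in the Purpose section; with TRANS = 'N' the complex driver behaves like its real counterparts.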
+* +*> \ingroup complex16GEsolve +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2022, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> \endverbatim +* +* ===================================================================== + SUBROUTINE ZGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, + $ INFO ) +* +* -- LAPACK driver routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER TRANS + INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) + COMPLEX*16 CZERO + PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY, TPSD + INTEGER BROW, I, IASCL, IBSCL, J, LWOPT, MN, MNNRHS, + $ NB, NBMIN, SCLLEN + DOUBLE PRECISION ANRM, BIGNUM, BNRM, SMLNUM +* .. +* .. Local Arrays .. + DOUBLE PRECISION RWORK( 1 ) +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + DOUBLE PRECISION DLAMCH, ZLANGE + EXTERNAL LSAME, ILAENV, DLAMCH, ZLANGE +* .. +* .. External Subroutines .. + EXTERNAL ZGELQT, ZGEQRT, ZGEMLQT, ZGEMQRT, DLABAD, + $ ZLASCL, ZLASET, ZTRTRS, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments. +* + INFO = 0 + MN = MIN( M, N ) + LQUERY = ( LWORK.EQ.-1 ) + IF( .NOT.( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'C' ) ) ) THEN + INFO = -1 + ELSE IF( M.LT.0 ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDB.LT.MAX( 1, M, N ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.MAX( 1, MN+MAX( MN, NRHS ) ) .AND. .NOT.LQUERY ) + $ THEN + INFO = -10 + END IF +* +* Figure out optimal block size and optimal workspace size +* + IF( INFO.EQ.0 .OR. INFO.EQ.-10 ) THEN +* + TPSD = .TRUE. + IF( LSAME( TRANS, 'N' ) ) + $ TPSD = .FALSE. +* + NB = ILAENV( 1, 'ZGELST', ' ', M, N, -1, -1 ) +* + MNNRHS = MAX( MN, NRHS ) + LWOPT = MAX( 1, (MN+MNNRHS)*NB ) + WORK( 1 ) = DBLE( LWOPT ) +* + END IF +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGELST ', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N, NRHS ).EQ.0 ) THEN + CALL ZLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, LDB ) + WORK( 1 ) = DBLE( LWOPT ) + RETURN + END IF +* +* *GEQRT and *GELQT routines cannot accept NB larger than min(M,N) +* + IF( NB.GT.MN ) NB = MN +* +* Determine the block size from the supplied LWORK +* ( at this stage we know that LWORK >= (minimum required workspace, +* but it may be less than optimal) +* + NB = MIN( NB, LWORK/( MN + MNNRHS ) ) +* +* The minimum value of NB, when blocked code is used +* + NBMIN = MAX( 2, ILAENV( 2, 'ZGELST', ' ', M, N, -1, -1 ) ) +* + IF( NB.LT.NBMIN ) THEN + NB = 1 + END IF +* +* Get machine parameters +* + SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' ) + BIGNUM = ONE / SMLNUM + CALL DLABAD( SMLNUM, BIGNUM ) +* +* Scale A, B if max element outside range [SMLNUM,BIGNUM] +* + ANRM = ZLANGE( 'M', M, N, A, LDA, RWORK ) + IASCL = 0 + IF( ANRM.GT.ZERO .AND. 
ANRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL ZLASCL( 'G', 0, 0, ANRM, SMLNUM, M, N, A, LDA, INFO ) + IASCL = 1 + ELSE IF( ANRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL ZLASCL( 'G', 0, 0, ANRM, BIGNUM, M, N, A, LDA, INFO ) + IASCL = 2 + ELSE IF( ANRM.EQ.ZERO ) THEN +* +* Matrix all zero. Return zero solution. +* + CALL ZLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, LDB ) + WORK( 1 ) = DBLE( LWOPT ) + RETURN + END IF +* + BROW = M + IF( TPSD ) + $ BROW = N + BNRM = ZLANGE( 'M', BROW, NRHS, B, LDB, RWORK ) + IBSCL = 0 + IF( BNRM.GT.ZERO .AND. BNRM.LT.SMLNUM ) THEN +* +* Scale matrix norm up to SMLNUM +* + CALL ZLASCL( 'G', 0, 0, BNRM, SMLNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 1 + ELSE IF( BNRM.GT.BIGNUM ) THEN +* +* Scale matrix norm down to BIGNUM +* + CALL ZLASCL( 'G', 0, 0, BNRM, BIGNUM, BROW, NRHS, B, LDB, + $ INFO ) + IBSCL = 2 + END IF +* + IF( M.GE.N ) THEN +* +* M > N: +* Compute the blocked QR factorization of A, +* using the compact WY representation of Q, +* workspace at least N, optimally N*NB. +* + CALL ZGEQRT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M > N, A is not transposed: +* Overdetermined system of equations, +* least-squares problem, min || A * X - B ||. +* +* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL ZGEMQRT( 'Left', 'Conjugate transpose', M, NRHS, N, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* +* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) +* + CALL ZTRTRS( 'Upper', 'No transpose', 'Non-unit', N, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = N +* + ELSE +* +* M > N, A is transposed: +* Underdetermined system of equations, +* minimum norm solution of A**T * X = B. +* +* Compute B := inv(R**T) * B in two row blocks of B. +* +* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) +* + CALL ZTRTRS( 'Upper', 'Conjugate transpose', 'Non-unit', + $ N, NRHS, A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the N-th row in B: +* B(N+1:M,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = N + 1, M + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL ZGEMQRT( 'Left', 'No transpose', M, NRHS, N, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = M +* + END IF +* + ELSE +* +* M < N: +* Compute the blocked LQ factorization of A, +* using the compact WY representation of Q, +* workspace at least M, optimally M*NB. +* + CALL ZGELQT( M, N, NB, A, LDA, WORK( 1 ), NB, + $ WORK( MN*NB+1 ), INFO ) +* + IF( .NOT.TPSD ) THEN +* +* M < N, A is not transposed: +* Underdetermined system of equations, +* minimum norm solution of A * X = B. +* +* Compute B := inv(L) * B in two row blocks of B. +* +* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) +* + CALL ZTRTRS( 'Lower', 'No transpose', 'Non-unit', M, NRHS, + $ A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* +* Block 2: Zero out all rows below the M-th row in B: +* B(M+1:N,1:NRHS) = ZERO +* + DO J = 1, NRHS + DO I = M + 1, N + B( I, J ) = ZERO + END DO + END DO +* +* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. 
+* + CALL ZGEMLQT( 'Left', 'Conjugate transpose', N, NRHS, M, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1 ), INFO ) +* + SCLLEN = N +* + ELSE +* +* M < N, A is transposed: +* Overdetermined system of equations, +* least-squares problem, min || A**T * X - B ||. +* +* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), +* using the compact WY representation of Q, +* workspace at least NRHS, optimally NRHS*NB. +* + CALL ZGEMLQT( 'Left', 'No transpose', N, NRHS, M, NB, + $ A, LDA, WORK( 1 ), NB, B, LDB, + $ WORK( MN*NB+1), INFO ) +* +* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) +* + CALL ZTRTRS( 'Lower', 'Conjugate transpose', 'Non-unit', + $ M, NRHS, A, LDA, B, LDB, INFO ) +* + IF( INFO.GT.0 ) THEN + RETURN + END IF +* + SCLLEN = M +* + END IF +* + END IF +* +* Undo scaling +* + IF( IASCL.EQ.1 ) THEN + CALL ZLASCL( 'G', 0, 0, ANRM, SMLNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IASCL.EQ.2 ) THEN + CALL ZLASCL( 'G', 0, 0, ANRM, BIGNUM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF + IF( IBSCL.EQ.1 ) THEN + CALL ZLASCL( 'G', 0, 0, SMLNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + ELSE IF( IBSCL.EQ.2 ) THEN + CALL ZLASCL( 'G', 0, 0, BIGNUM, BNRM, SCLLEN, NRHS, B, LDB, + $ INFO ) + END IF +* + WORK( 1 ) = DBLE( LWOPT ) +* + RETURN +* +* End of ZGELST +* + END From 1497336b203a7efa09cc788099a79d7732662fdf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Nov 2022 22:39:16 +0100 Subject: [PATCH 115/154] Add tests for ?GELST (Reference-LAPACK PR739) --- lapack-netlib/TESTING/LIN/alahd.f | 22 +- lapack-netlib/TESTING/LIN/cdrvls.f | 320 ++++++++++++++++++++------- lapack-netlib/TESTING/LIN/cerrls.f | 61 +++++- lapack-netlib/TESTING/LIN/ddrvls.f | 339 +++++++++++++++++++++-------- lapack-netlib/TESTING/LIN/derrls.f | 61 +++++- lapack-netlib/TESTING/LIN/sdrvls.f | 333 ++++++++++++++++++++-------- lapack-netlib/TESTING/LIN/serrls.f | 61 +++++- lapack-netlib/TESTING/LIN/zdrvls.f | 333 ++++++++++++++++++++-------- lapack-netlib/TESTING/LIN/zerrls.f | 61 +++++- 9 files changed, 1234 insertions(+), 357 deletions(-) diff --git a/lapack-netlib/TESTING/LIN/alahd.f b/lapack-netlib/TESTING/LIN/alahd.f index 2cc0fba06..f0423a23b 100644 --- a/lapack-netlib/TESTING/LIN/alahd.f +++ b/lapack-netlib/TESTING/LIN/alahd.f @@ -608,17 +608,18 @@ ELSE IF( LSAMEN( 2, P2, 'LS' ) ) THEN * * LS: Least Squares driver routines for -* LS, LSD, LSS, LSX and LSY. +* LS, LST, TSLS, LSD, LSS, LSX and LSY. 
* WRITE( IOUNIT, FMT = 9984 )PATH WRITE( IOUNIT, FMT = 9967 ) - WRITE( IOUNIT, FMT = 9921 )C1, C1, C1, C1 + WRITE( IOUNIT, FMT = 9921 )C1, C1, C1, C1, C1, C1 WRITE( IOUNIT, FMT = 9935 )1 WRITE( IOUNIT, FMT = 9931 )2 - WRITE( IOUNIT, FMT = 9933 )3 - WRITE( IOUNIT, FMT = 9935 )4 - WRITE( IOUNIT, FMT = 9934 )5 - WRITE( IOUNIT, FMT = 9932 )6 + WRITE( IOUNIT, FMT = 9919 ) + WRITE( IOUNIT, FMT = 9933 )7 + WRITE( IOUNIT, FMT = 9935 )8 + WRITE( IOUNIT, FMT = 9934 )9 + WRITE( IOUNIT, FMT = 9932 )10 WRITE( IOUNIT, FMT = 9920 ) WRITE( IOUNIT, FMT = '( '' Messages:'' )' ) * @@ -1048,10 +1049,11 @@ $ 'check if X is in the row space of A or A'' ', $ '(overdetermined case)' ) 9929 FORMAT( ' Test ratios (1-3: ', A1, 'TZRZF):' ) - 9920 FORMAT( 3X, ' 7-10: same as 3-6', 3X, ' 11-14: same as 3-6' ) - 9921 FORMAT( ' Test ratios:', / ' (1-2: ', A1, 'GELS, 3-6: ', A1, - $ 'GELSY, 7-10: ', A1, 'GELSS, 11-14: ', A1, 'GELSD, 15-16: ', - $ A1, 'GETSLS)') + 9919 FORMAT( 3X, ' 3-4: same as 1-2', 3X, ' 5-6: same as 1-2' ) + 9920 FORMAT( 3X, ' 11-14: same as 7-10', 3X, ' 15-18: same as 7-10' ) + 9921 FORMAT( ' Test ratios:', / ' (1-2: ', A1, 'GELS, 3-4: ', A1, + $ 'GELST, 5-6: ', A1, 'GETSLS, 7-10: ', A1, 'GELSY, 11-14: ', + $ A1, 'GETSS, 15-18: ', A1, 'GELSD)' ) 9928 FORMAT( 7X, 'where ALPHA = ( 1 + SQRT( 17 ) ) / 8' ) 9927 FORMAT( 3X, I2, ': ABS( Largest element in L )', / 12X, $ ' - ( 1 / ( 1 - ALPHA ) ) + THRESH' ) diff --git a/lapack-netlib/TESTING/LIN/cdrvls.f b/lapack-netlib/TESTING/LIN/cdrvls.f index 7fe189e5f..ecba705d5 100644 --- a/lapack-netlib/TESTING/LIN/cdrvls.f +++ b/lapack-netlib/TESTING/LIN/cdrvls.f @@ -31,7 +31,8 @@ *> *> \verbatim *> -*> CDRVLS tests the least squares driver routines CGELS, CGETSLS, CGELSS, CGELSY +*> CDRVLS tests the least squares driver routines CGELS, CGELST, +*> CGETSLS, CGELSS, CGELSY *> and CGELSD. *> \endverbatim * @@ -211,7 +212,7 @@ * * .. Parameters .. INTEGER NTESTS - PARAMETER ( NTESTS = 16 ) + PARAMETER ( NTESTS = 18 ) INTEGER SMLSIZ PARAMETER ( SMLSIZ = 25 ) REAL ONE, ZERO @@ -228,8 +229,8 @@ $ LWLSY, LWORK, M, MNMIN, N, NB, NCOLS, NERRS, $ NFAIL, NRHS, NROWS, NRUN, RANK, MB, $ MMAX, NMAX, NSMAX, LIWORK, LRWORK, - $ LWORK_CGELS, LWORK_CGETSLS, LWORK_CGELSS, - $ LWORK_CGELSY, LWORK_CGELSD, + $ LWORK_CGELS, LWORK_CGELST, LWORK_CGETSLS, + $ LWORK_CGELSS, LWORK_CGELSY, LWORK_CGELSD, $ LRWORK_CGELSY, LRWORK_CGELSS, LRWORK_CGELSD REAL EPS, NORMA, NORMB, RCOND * .. @@ -249,7 +250,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASVM, CERRLS, CGELS, CGELSD, - $ CGELSS, CGELSY, CGEMM, CGETSLS, CLACPY, + $ CGELSS, CGELST, CGELSY, CGEMM, CGETSLS, CLACPY, $ CLARNV, CQRT13, CQRT15, CQRT16, CSSCAL, $ SAXPY, XLAENV * .. @@ -334,7 +335,8 @@ LIWORK = 1 * * Iterate through all test cases and compute necessary workspace -* sizes for ?GELS, ?GETSLS, ?GELSY, ?GELSS and ?GELSD routines. +* sizes for ?GELS, ?GELST, ?GETSLS, ?GELSY, ?GELSS and ?GELSD +* routines. 
* DO IM = 1, NM M = MVAL( IM ) @@ -361,6 +363,10 @@ CALL CGELS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) LWORK_CGELS = INT( WQ( 1 ) ) +* Compute workspace needed for CGELST + CALL CGELST( TRANS, M, N, NRHS, A, LDA, + $ B, LDB, WQ, -1, INFO ) + LWORK_CGELST = INT ( WQ ( 1 ) ) * Compute workspace needed for CGETSLS CALL CGETSLS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) @@ -425,21 +431,26 @@ ITYPE = ( IRANK-1 )*3 + ISCALE IF( .NOT.DOTYPE( ITYPE ) ) $ GO TO 100 -* +* ===================================================== +* Begin test CGELS +* ===================================================== IF( IRANK.EQ.1 ) THEN * -* Test CGELS -* * Generate a matrix of scaling type ISCALE * CALL CQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 40 INB = 1, NNB +* +* Loop for testing different block sizes. +* + DO INB = 1, NNB NB = NBVAL( INB ) CALL XLAENV( 1, NB ) CALL XLAENV( 3, NXVAL( INB ) ) * - DO 30 ITRAN = 1, 2 +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -484,15 +495,20 @@ $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 1: Check correctness of results +* for CGELS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL CLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL CQRT16( TRANS, M, N, NRHS, COPYA, $ LDA, B, LDB, C, LDB, RWORK, $ RESULT( 1 ) ) +* +* Test 2: Check correctness of results +* for CGELS. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN @@ -515,7 +531,7 @@ * Print information about the tests that * did not pass the threshold. * - DO 20 K = 1, 2 + DO K = 1, 2 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) @@ -524,26 +540,34 @@ $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 20 CONTINUE + END DO NRUN = NRUN + 2 - 30 CONTINUE - 40 CONTINUE -* -* -* Test CGETSLS + END DO + END DO + END IF +* ===================================================== +* End test CGELS +* ===================================================== +* ===================================================== +* Begin test CGELST +* ===================================================== + IF( IRANK.EQ.1 ) THEN * * Generate a matrix of scaling type ISCALE * CALL CQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 65 INB = 1, NNB - MB = NBVAL( INB ) - CALL XLAENV( 1, MB ) - DO 62 IMB = 1, NNB - NB = NBVAL( IMB ) - CALL XLAENV( 2, NB ) * - DO 60 ITRAN = 1, 2 +* Loop for testing different block sizes. +* + DO INB = 1, NNB + NB = NBVAL( INB ) + CALL XLAENV( 1, NB ) + CALL XLAENV( 3, NXVAL( INB ) ) +* +* Loop for testing non-transposed and transposed. 
+* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -560,9 +584,9 @@ IF( NCOLS.GT.0 ) THEN CALL CLARNV( 2, ISEED, NCOLS*NRHS, $ WORK ) - CALL CSCAL( NCOLS*NRHS, - $ CONE / REAL( NCOLS ), WORK, - $ 1 ) + CALL CSSCAL( NCOLS*NRHS, + $ ONE / REAL( NCOLS ), WORK, + $ 1 ) END IF CALL CGEMM( TRANS, 'No transpose', NROWS, $ NRHS, NCOLS, CONE, COPYA, LDA, @@ -578,31 +602,37 @@ CALL CLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, B, LDB ) END IF - SRNAMT = 'CGETSLS ' - CALL CGETSLS( TRANS, M, N, NRHS, A, - $ LDA, B, LDB, WORK, LWORK, INFO ) + SRNAMT = 'CGELST' + CALL CGELST( TRANS, M, N, NRHS, A, LDA, B, + $ LDB, WORK, LWORK, INFO ) +* IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'CGETSLS ', INFO, 0, + $ CALL ALAERH( PATH, 'CGELST', INFO, 0, $ TRANS, M, N, NRHS, -1, NB, $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 3: Check correctness of results +* for CGELST, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL CLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL CQRT16( TRANS, M, N, NRHS, COPYA, - $ LDA, B, LDB, C, LDB, WORK2, - $ RESULT( 15 ) ) + $ LDA, B, LDB, C, LDB, RWORK, + $ RESULT( 3 ) ) +* +* Test 4: Check correctness of results +* for CGELST. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN * * Solving LS system * - RESULT( 16 ) = CQRT17( TRANS, 1, M, N, + RESULT( 4 ) = CQRT17( TRANS, 1, M, N, $ NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, $ LWORK ) @@ -610,7 +640,7 @@ * * Solving overdetermined system * - RESULT( 16 ) = CQRT14( TRANS, M, N, + RESULT( 4 ) = CQRT14( TRANS, M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) END IF @@ -618,21 +648,151 @@ * Print information about the tests that * did not pass the threshold. * - DO 50 K = 15, 16 + DO K = 3, 4 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) - WRITE( NOUT, FMT = 9997 )TRANS, M, - $ N, NRHS, MB, NB, ITYPE, K, + WRITE( NOUT, FMT = 9999 )TRANS, M, + $ N, NRHS, NB, ITYPE, K, $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 50 CONTINUE + END DO NRUN = NRUN + 2 - 60 CONTINUE - 62 CONTINUE - 65 CONTINUE + END DO + END DO + END IF +* ===================================================== +* End test CGELST +* ===================================================== +* ===================================================== +* Begin test CGELSTSLS +* ===================================================== + IF( IRANK.EQ.1 ) THEN +* +* Generate a matrix of scaling type ISCALE +* + CALL CQRT13( ISCALE, M, N, COPYA, LDA, NORMA, + $ ISEED ) +* +* Loop for testing different block sizes MB. +* + DO INB = 1, NNB + MB = NBVAL( INB ) + CALL XLAENV( 1, MB ) +* +* Loop for testing different block sizes NB. +* + DO IMB = 1, NNB + NB = NBVAL( IMB ) + CALL XLAENV( 2, NB ) +* +* Loop for testing non-transposed +* and transposed. +* + DO ITRAN = 1, 2 + IF( ITRAN.EQ.1 ) THEN + TRANS = 'N' + NROWS = M + NCOLS = N + ELSE + TRANS = 'C' + NROWS = N + NCOLS = M + END IF + LDWORK = MAX( 1, NCOLS ) +* +* Set up a consistent rhs +* + IF( NCOLS.GT.0 ) THEN + CALL CLARNV( 2, ISEED, NCOLS*NRHS, + $ WORK ) + CALL CSCAL( NCOLS*NRHS, + $ CONE / REAL( NCOLS ), + $ WORK, 1 ) + END IF + CALL CGEMM( TRANS, 'No transpose', + $ NROWS, NRHS, NCOLS, CONE, + $ COPYA, LDA, WORK, LDWORK, + $ CZERO, B, LDB ) + CALL CLACPY( 'Full', NROWS, NRHS, + $ B, LDB, COPYB, LDB ) +* +* Solve LS or overdetermined system +* + IF( M.GT.0 .AND. 
N.GT.0 ) THEN + CALL CLACPY( 'Full', M, N, + $ COPYA, LDA, A, LDA ) + CALL CLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, B, LDB ) + END IF + SRNAMT = 'CGETSLS ' + CALL CGETSLS( TRANS, M, N, NRHS, A, + $ LDA, B, LDB, WORK, LWORK, + $ INFO ) + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'CGETSLS ', INFO, + $ 0, TRANS, M, N, NRHS, + $ -1, NB, ITYPE, NFAIL, + $ NERRS, NOUT ) +* +* Test 5: Check correctness of results +* for CGETSLS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) +* + IF( NROWS.GT.0 .AND. NRHS.GT.0 ) + $ CALL CLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, C, LDB ) + CALL CQRT16( TRANS, M, N, NRHS, + $ COPYA, LDA, B, LDB, + $ C, LDB, WORK2, + $ RESULT( 5 ) ) +* +* Test 6: Check correctness of results +* for CGETSLS. +* + IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. + $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN +* +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) +* + RESULT( 6 ) = CQRT17( TRANS, 1, M, + $ N, NRHS, COPYA, LDA, + $ B, LDB, COPYB, LDB, + $ C, WORK, LWORK ) + ELSE +* +* Solving overdetermined system +* + RESULT( 6 ) = CQRT14( TRANS, M, N, + $ NRHS, COPYA, LDA, B, + $ LDB, WORK, LWORK ) + END IF +* +* Print information about the tests that +* did not pass the threshold. +* + DO K = 5, 6 + IF( RESULT( K ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9997 )TRANS, + $ M, N, NRHS, MB, NB, ITYPE, K, + $ RESULT( K ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 2 + END DO + END DO + END DO END IF +* ===================================================== +* End test CGELSTSLS +* ==================================================== * * Generate a matrix of scaling type ISCALE and rank * type IRANK. 
@@ -680,37 +840,37 @@ * * workspace used: 2*MNMIN+NB*NB+NB*MAX(N,NRHS) * -* Test 3: Compute relative error in svd +* Test 7: Compute relative error in svd * workspace: M*N + 4*MIN(M,N) + MAX(M,N) * - RESULT( 3 ) = CQRT12( CRANK, CRANK, A, LDA, + RESULT( 7 ) = CQRT12( CRANK, CRANK, A, LDA, $ COPYS, WORK, LWORK, RWORK ) * -* Test 4: Compute error in solution +* Test 8: Compute error in solution * workspace: M*NRHS + M * CALL CLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL CQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, RWORK, - $ RESULT( 4 ) ) + $ RESULT( 8 ) ) * -* Test 5: Check norm of r'*A +* Test 9: Check norm of r'*A * workspace: NRHS*(M+N) * - RESULT( 5 ) = ZERO + RESULT( 9 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 5 ) = CQRT17( 'No transpose', 1, M, + $ RESULT( 9 ) = CQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 6: Check if x is in the rowspace of A +* Test 10: Check if x is in the rowspace of A * workspace: (M+NRHS)*(N+2) * - RESULT( 6 ) = ZERO + RESULT( 10 ) = ZERO * IF( N.GT.CRANK ) - $ RESULT( 6 ) = CQRT14( 'No transpose', M, N, + $ RESULT( 10 ) = CQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -736,38 +896,38 @@ * workspace used: 3*min(m,n) + * max(2*min(m,n),nrhs,max(m,n)) * -* Test 7: Compute relative error in svd +* Test 11: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL SAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 7 ) = SASUM( MNMIN, S, 1 ) / + RESULT( 11 ) = SASUM( MNMIN, S, 1 ) / $ SASUM( MNMIN, COPYS, 1 ) / $ ( EPS*REAL( MNMIN ) ) ELSE - RESULT( 7 ) = ZERO + RESULT( 11 ) = ZERO END IF * -* Test 8: Compute error in solution +* Test 12: Compute error in solution * CALL CLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL CQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, RWORK, - $ RESULT( 8 ) ) + $ RESULT( 12 ) ) * -* Test 9: Check norm of r'*A +* Test 13: Check norm of r'*A * - RESULT( 9 ) = ZERO + RESULT( 13 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 9 ) = CQRT17( 'No transpose', 1, M, + $ RESULT( 13 ) = CQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 10: Check if x is in the rowspace of A +* Test 14: Check if x is in the rowspace of A * - RESULT( 10 ) = ZERO + RESULT( 14 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 10 ) = CQRT14( 'No transpose', M, N, + $ RESULT( 14 ) = CQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -792,45 +952,45 @@ $ N, NRHS, -1, NB, ITYPE, NFAIL, $ NERRS, NOUT ) * -* Test 11: Compute relative error in svd +* Test 15: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL SAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 11 ) = SASUM( MNMIN, S, 1 ) / + RESULT( 15 ) = SASUM( MNMIN, S, 1 ) / $ SASUM( MNMIN, COPYS, 1 ) / $ ( EPS*REAL( MNMIN ) ) ELSE - RESULT( 11 ) = ZERO + RESULT( 15 ) = ZERO END IF * -* Test 12: Compute error in solution +* Test 16: Compute error in solution * CALL CLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL CQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, RWORK, - $ RESULT( 12 ) ) + $ RESULT( 16 ) ) * -* Test 13: Check norm of r'*A +* Test 17: Check norm of r'*A * - RESULT( 13 ) = ZERO + RESULT( 17 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 13 ) = CQRT17( 'No transpose', 1, M, + $ RESULT( 17 ) = CQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 14: Check if x is in the rowspace of A +* Test 18: Check if x is in the rowspace 
of A * - RESULT( 14 ) = ZERO + RESULT( 18 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 14 ) = CQRT14( 'No transpose', M, N, + $ RESULT( 18 ) = CQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * * Print information about the tests that did not * pass the threshold. * - DO 80 K = 3, 14 + DO 80 K = 7, 18 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) diff --git a/lapack-netlib/TESTING/LIN/cerrls.f b/lapack-netlib/TESTING/LIN/cerrls.f index 48e44ad86..fca943918 100644 --- a/lapack-netlib/TESTING/LIN/cerrls.f +++ b/lapack-netlib/TESTING/LIN/cerrls.f @@ -22,7 +22,7 @@ *> \verbatim *> *> CERRLS tests the error exits for the COMPLEX least squares -*> driver routines (CGELS, CGELSS, CGELSY, CGELSD). +*> driver routines (CGELS, CGELST, CGETSLS, CGELSS, CGELSY, CGELSD). *> \endverbatim * * Arguments: @@ -83,7 +83,8 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CGELS, CGELSD, CGELSS, CGELSY, CHKXER + EXTERNAL ALAESM, CHKXER, CGELS, CGELSD, CGELSS, CGELST, + $ CGELSY, CGETSLS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -130,10 +131,66 @@ INFOT = 8 CALL CGELS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) CALL CHKXER( 'CGELS ', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGELS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'CGELS', INFOT, NOUT, LERR, OK ) INFOT = 10 CALL CGELS( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) CALL CHKXER( 'CGELS ', INFOT, NOUT, LERR, OK ) * +* CGELST +* + SRNAMT = 'CGELST' + INFOT = 1 + CALL CGELST( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGELST( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGELST( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGELST( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CGELST( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGELST( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGELST( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGELST( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGELST', INFOT, NOUT, LERR, OK ) +* +* CGETSLS +* + SRNAMT = 'CGETSLS' + INFOT = 1 + CALL CGETSLS( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGETSLS( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGETSLS( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGETSLS( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CGETSLS( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGETSLS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGETSLS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'CGETSLS', INFOT, NOUT, LERR, OK ) +* * CGELSS * SRNAMT = 'CGELSS' diff --git a/lapack-netlib/TESTING/LIN/ddrvls.f b/lapack-netlib/TESTING/LIN/ddrvls.f index b64930c10..b3d07d67f 100644 --- a/lapack-netlib/TESTING/LIN/ddrvls.f +++ b/lapack-netlib/TESTING/LIN/ddrvls.f @@ -31,8 +31,8 @@ *> *> \verbatim 
*> -*> DDRVLS tests the least squares driver routines DGELS, DGETSLS, DGELSS, DGELSY, -*> and DGELSD. +*> DDRVLS tests the least squares driver routines DGELS, DGELST, +*> DGETSLS, DGELSS, DGELSY, and DGELSD. *> \endverbatim * * Arguments: @@ -211,7 +211,7 @@ * * .. Parameters .. INTEGER NTESTS - PARAMETER ( NTESTS = 16 ) + PARAMETER ( NTESTS = 18 ) INTEGER SMLSIZ PARAMETER ( SMLSIZ = 25 ) DOUBLE PRECISION ONE, TWO, ZERO @@ -225,8 +225,8 @@ $ LWLSY, LWORK, M, MNMIN, N, NB, NCOLS, NERRS, $ NFAIL, NRHS, NROWS, NRUN, RANK, MB, $ MMAX, NMAX, NSMAX, LIWORK, - $ LWORK_DGELS, LWORK_DGETSLS, LWORK_DGELSS, - $ LWORK_DGELSY, LWORK_DGELSD + $ LWORK_DGELS, LWORK_DGELST, LWORK_DGETSLS, + $ LWORK_DGELSS, LWORK_DGELSY, LWORK_DGELSD DOUBLE PRECISION EPS, NORMA, NORMB, RCOND * .. * .. Local Arrays .. @@ -243,12 +243,12 @@ * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASVM, DAXPY, DERRLS, DGELS, - $ DGELSD, DGELSS, DGELSY, DGEMM, DLACPY, - $ DLARNV, DLASRT, DQRT13, DQRT15, DQRT16, DSCAL, - $ XLAENV + $ DGELSD, DGELSS, DGELST, DGELSY, DGEMM, + $ DGETSLS, DLACPY, DLARNV, DQRT13, DQRT15, + $ DQRT16, DSCAL, XLAENV * .. * .. Intrinsic Functions .. - INTRINSIC DBLE, INT, LOG, MAX, MIN, SQRT + INTRINSIC DBLE, INT, MAX, MIN, SQRT * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -330,7 +330,8 @@ LIWORK = 1 * * Iterate through all test cases and compute necessary workspace -* sizes for ?GELS, ?GETSLS, ?GELSY, ?GELSS and ?GELSD routines. +* sizes for ?GELS, ?GELST, ?GETSLS, ?GELSY, ?GELSS and ?GELSD +* routines. * DO IM = 1, NM M = MVAL( IM ) @@ -357,6 +358,10 @@ CALL DGELS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) LWORK_DGELS = INT ( WQ ( 1 ) ) +* Compute workspace needed for DGELST + CALL DGELST( TRANS, M, N, NRHS, A, LDA, + $ B, LDB, WQ, -1, INFO ) + LWORK_DGELST = INT ( WQ ( 1 ) ) * Compute workspace needed for DGETSLS CALL DGETSLS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) @@ -378,9 +383,9 @@ * Compute LIWORK workspace needed for DGELSY and DGELSD LIWORK = MAX( LIWORK, N, IWQ( 1 ) ) * Compute LWORK workspace needed for all functions - LWORK = MAX( LWORK, LWORK_DGELS, LWORK_DGETSLS, - $ LWORK_DGELSY, LWORK_DGELSS, - $ LWORK_DGELSD ) + LWORK = MAX( LWORK, LWORK_DGELS, LWORK_DGELST, + $ LWORK_DGETSLS, LWORK_DGELSY, + $ LWORK_DGELSS, LWORK_DGELSD ) END IF ENDDO ENDDO @@ -411,21 +416,26 @@ ITYPE = ( IRANK-1 )*3 + ISCALE IF( .NOT.DOTYPE( ITYPE ) ) $ GO TO 110 -* +* ===================================================== +* Begin test DGELS +* ===================================================== IF( IRANK.EQ.1 ) THEN * -* Test DGELS -* * Generate a matrix of scaling type ISCALE * CALL DQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 40 INB = 1, NNB +* +* Loop for testing different block sizes. +* + DO INB = 1, NNB NB = NBVAL( INB ) CALL XLAENV( 1, NB ) CALL XLAENV( 3, NXVAL( INB ) ) * - DO 30 ITRAN = 1, 2 +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -469,20 +479,27 @@ $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 1: Check correctness of results +* for DGELS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL DLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL DQRT16( TRANS, M, N, NRHS, COPYA, $ LDA, B, LDB, C, LDB, WORK, $ RESULT( 1 ) ) +* +* Test 2: Check correctness of results +* for DGELS. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. 
M.LT.N ) ) THEN * -* Solving LS system +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) * RESULT( 2 ) = DQRT17( TRANS, 1, M, N, $ NRHS, COPYA, LDA, B, LDB, @@ -500,35 +517,42 @@ * Print information about the tests that * did not pass the threshold. * - DO 20 K = 1, 2 + DO K = 1, 2 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) - WRITE( NOUT, FMT = 9999 )TRANS, M, + WRITE( NOUT, FMT = 9999 ) TRANS, M, $ N, NRHS, NB, ITYPE, K, $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 20 CONTINUE + END DO NRUN = NRUN + 2 - 30 CONTINUE - 40 CONTINUE -* -* -* Test DGETSLS + END DO + END DO + END IF +* ===================================================== +* End test DGELS +* ===================================================== +* ===================================================== +* Begin test DGELST +* ===================================================== + IF( IRANK.EQ.1 ) THEN * * Generate a matrix of scaling type ISCALE * CALL DQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 65 INB = 1, NNB - MB = NBVAL( INB ) - CALL XLAENV( 1, MB ) - DO 62 IMB = 1, NNB - NB = NBVAL( IMB ) - CALL XLAENV( 2, NB ) * - DO 60 ITRAN = 1, 2 +* Loop for testing different block sizes. +* + DO INB = 1, NNB + NB = NBVAL( INB ) + CALL XLAENV( 1, NB ) +* +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -563,31 +587,38 @@ CALL DLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, B, LDB ) END IF - SRNAMT = 'DGETSLS ' - CALL DGETSLS( TRANS, M, N, NRHS, A, - $ LDA, B, LDB, WORK, LWORK, INFO ) + SRNAMT = 'DGELST' + CALL DGELST( TRANS, M, N, NRHS, A, LDA, B, + $ LDB, WORK, LWORK, INFO ) IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'DGETSLS ', INFO, 0, + $ CALL ALAERH( PATH, 'DGELST', INFO, 0, $ TRANS, M, N, NRHS, -1, NB, $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 3: Check correctness of results +* for DGELST, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL DLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL DQRT16( TRANS, M, N, NRHS, COPYA, $ LDA, B, LDB, C, LDB, WORK, - $ RESULT( 15 ) ) + $ RESULT( 3 ) ) +* +* Test 4: Check correctness of results +* for DGELST. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN * -* Solving LS system +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) * - RESULT( 16 ) = DQRT17( TRANS, 1, M, N, + RESULT( 4 ) = DQRT17( TRANS, 1, M, N, $ NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, $ LWORK ) @@ -595,7 +626,7 @@ * * Solving overdetermined system * - RESULT( 16 ) = DQRT14( TRANS, M, N, + RESULT( 4 ) = DQRT14( TRANS, M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) END IF @@ -603,21 +634,151 @@ * Print information about the tests that * did not pass the threshold. * - DO 50 K = 15, 16 + DO K = 3, 4 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) - WRITE( NOUT, FMT = 9997 )TRANS, M, - $ N, NRHS, MB, NB, ITYPE, K, + WRITE( NOUT, FMT = 9999 ) TRANS, M, + $ N, NRHS, NB, ITYPE, K, $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 50 CONTINUE + END DO NRUN = NRUN + 2 - 60 CONTINUE - 62 CONTINUE - 65 CONTINUE + END DO + END DO + END IF +* ===================================================== +* End test DGELST +* ===================================================== +* ===================================================== +* Begin test DGETSLS +* ===================================================== + IF( IRANK.EQ.1 ) THEN +* +* Generate a matrix of scaling type ISCALE +* + CALL DQRT13( ISCALE, M, N, COPYA, LDA, NORMA, + $ ISEED ) +* +* Loop for testing different block sizes MB. +* + DO IMB = 1, NNB + MB = NBVAL( IMB ) + CALL XLAENV( 1, MB ) +* +* Loop for testing different block sizes NB. +* + DO INB = 1, NNB + NB = NBVAL( INB ) + CALL XLAENV( 2, NB ) +* +* Loop for testing non-transposed +* and transposed. +* + DO ITRAN = 1, 2 + IF( ITRAN.EQ.1 ) THEN + TRANS = 'N' + NROWS = M + NCOLS = N + ELSE + TRANS = 'T' + NROWS = N + NCOLS = M + END IF + LDWORK = MAX( 1, NCOLS ) +* +* Set up a consistent rhs +* + IF( NCOLS.GT.0 ) THEN + CALL DLARNV( 2, ISEED, NCOLS*NRHS, + $ WORK ) + CALL DSCAL( NCOLS*NRHS, + $ ONE / DBLE( NCOLS ), + $ WORK, 1 ) + END IF + CALL DGEMM( TRANS, 'No transpose', + $ NROWS, NRHS, NCOLS, ONE, + $ COPYA, LDA, WORK, LDWORK, + $ ZERO, B, LDB ) + CALL DLACPY( 'Full', NROWS, NRHS, + $ B, LDB, COPYB, LDB ) +* +* Solve LS or overdetermined system +* + IF( M.GT.0 .AND. N.GT.0 ) THEN + CALL DLACPY( 'Full', M, N, + $ COPYA, LDA, A, LDA ) + CALL DLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, B, LDB ) + END IF + SRNAMT = 'DGETSLS' + CALL DGETSLS( TRANS, M, N, NRHS, + $ A, LDA, B, LDB, WORK, LWORK, + $ INFO ) + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'DGETSLS', INFO, + $ 0, TRANS, M, N, NRHS, + $ -1, NB, ITYPE, NFAIL, + $ NERRS, NOUT ) +* +* Test 5: Check correctness of results +* for DGETSLS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) +* + IF( NROWS.GT.0 .AND. NRHS.GT.0 ) + $ CALL DLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, C, LDB ) + CALL DQRT16( TRANS, M, N, NRHS, + $ COPYA, LDA, B, LDB, + $ C, LDB, WORK, + $ RESULT( 5 ) ) +* +* Test 6: Check correctness of results +* for DGETSLS. +* + IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. + $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN +* +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) +* + RESULT( 6 ) = DQRT17( TRANS, 1, M, + $ N, NRHS, COPYA, LDA, + $ B, LDB, COPYB, LDB, + $ C, WORK, LWORK ) + ELSE +* +* Solving overdetermined system +* + RESULT( 6 ) = DQRT14( TRANS, M, N, + $ NRHS, COPYA, LDA, + $ B, LDB, WORK, LWORK ) + END IF +* +* Print information about the tests that +* did not pass the threshold. +* + DO K = 5, 6 + IF( RESULT( K ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9997 ) TRANS, + $ M, N, NRHS, MB, NB, ITYPE, + $ K, RESULT( K ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 2 + END DO + END DO + END DO END IF +* ===================================================== +* End test DGETSLS +* ===================================================== * * Generate a matrix of scaling type ISCALE and rank * type IRANK. 
@@ -662,37 +823,37 @@ $ N, NRHS, -1, NB, ITYPE, NFAIL, $ NERRS, NOUT ) * -* Test 3: Compute relative error in svd +* Test 7: Compute relative error in svd * workspace: M*N + 4*MIN(M,N) + MAX(M,N) * - RESULT( 3 ) = DQRT12( CRANK, CRANK, A, LDA, + RESULT( 7 ) = DQRT12( CRANK, CRANK, A, LDA, $ COPYS, WORK, LWORK ) * -* Test 4: Compute error in solution +* Test 8: Compute error in solution * workspace: M*NRHS + M * CALL DLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL DQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, - $ WORK( M*NRHS+1 ), RESULT( 4 ) ) + $ WORK( M*NRHS+1 ), RESULT( 8 ) ) * -* Test 5: Check norm of r'*A +* Test 9: Check norm of r'*A * workspace: NRHS*(M+N) * - RESULT( 5 ) = ZERO + RESULT( 9 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 5 ) = DQRT17( 'No transpose', 1, M, + $ RESULT( 9 ) = DQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 6: Check if x is in the rowspace of A +* Test 10: Check if x is in the rowspace of A * workspace: (M+NRHS)*(N+2) * - RESULT( 6 ) = ZERO + RESULT( 10 ) = ZERO * IF( N.GT.CRANK ) - $ RESULT( 6 ) = DQRT14( 'No transpose', M, N, + $ RESULT( 10 ) = DQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -716,38 +877,38 @@ * workspace used: 3*min(m,n) + * max(2*min(m,n),nrhs,max(m,n)) * -* Test 7: Compute relative error in svd +* Test 11: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL DAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 7 ) = DASUM( MNMIN, S, 1 ) / + RESULT( 11 ) = DASUM( MNMIN, S, 1 ) / $ DASUM( MNMIN, COPYS, 1 ) / $ ( EPS*DBLE( MNMIN ) ) ELSE - RESULT( 7 ) = ZERO + RESULT( 11 ) = ZERO END IF * -* Test 8: Compute error in solution +* Test 12: Compute error in solution * CALL DLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL DQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, - $ WORK( M*NRHS+1 ), RESULT( 8 ) ) + $ WORK( M*NRHS+1 ), RESULT( 12 ) ) * -* Test 9: Check norm of r'*A +* Test 13: Check norm of r'*A * - RESULT( 9 ) = ZERO + RESULT( 13 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 9 ) = DQRT17( 'No transpose', 1, M, + $ RESULT( 13 ) = DQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 10: Check if x is in the rowspace of A +* Test 14: Check if x is in the rowspace of A * - RESULT( 10 ) = ZERO + RESULT( 14 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 10 ) = DQRT14( 'No transpose', M, N, + $ RESULT( 14 ) = DQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -776,45 +937,45 @@ $ N, NRHS, -1, NB, ITYPE, NFAIL, $ NERRS, NOUT ) * -* Test 11: Compute relative error in svd +* Test 15: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL DAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 11 ) = DASUM( MNMIN, S, 1 ) / + RESULT( 15 ) = DASUM( MNMIN, S, 1 ) / $ DASUM( MNMIN, COPYS, 1 ) / $ ( EPS*DBLE( MNMIN ) ) ELSE - RESULT( 11 ) = ZERO + RESULT( 15 ) = ZERO END IF * -* Test 12: Compute error in solution +* Test 16: Compute error in solution * CALL DLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL DQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, - $ WORK( M*NRHS+1 ), RESULT( 12 ) ) + $ WORK( M*NRHS+1 ), RESULT( 16 ) ) * -* Test 13: Check norm of r'*A +* Test 17: Check norm of r'*A * - RESULT( 13 ) = ZERO + RESULT( 17 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 13 ) = DQRT17( 'No transpose', 1, M, + $ RESULT( 17 ) = DQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 
14: Check if x is in the rowspace of A +* Test 18: Check if x is in the rowspace of A * - RESULT( 14 ) = ZERO + RESULT( 18 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 14 ) = DQRT14( 'No transpose', M, N, + $ RESULT( 18 ) = DQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * * Print information about the tests that did not * pass the threshold. * - DO 90 K = 3, 14 + DO 90 K = 7, 18 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) @@ -826,6 +987,12 @@ NRUN = NRUN + 12 * 100 CONTINUE + + + + + + 110 CONTINUE 120 CONTINUE 130 CONTINUE diff --git a/lapack-netlib/TESTING/LIN/derrls.f b/lapack-netlib/TESTING/LIN/derrls.f index a1f74dec2..09d745238 100644 --- a/lapack-netlib/TESTING/LIN/derrls.f +++ b/lapack-netlib/TESTING/LIN/derrls.f @@ -22,7 +22,7 @@ *> \verbatim *> *> DERRLS tests the error exits for the DOUBLE PRECISION least squares -*> driver routines (DGELS, SGELSS, SGELSY, SGELSD). +*> driver routines (DGELS, DGELST, DGETSLS, SGELSS, SGELSY, SGELSD). *> \endverbatim * * Arguments: @@ -83,7 +83,8 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, DGELS, DGELSD, DGELSS, DGELSY + EXTERNAL ALAESM, CHKXER, DGELS, DGELSD, DGELSS, DGELST, + $ DGELSY, DGETSLS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -130,10 +131,66 @@ INFOT = 8 CALL DGELS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) CALL CHKXER( 'DGELS ', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGELS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'DGELS', INFOT, NOUT, LERR, OK ) INFOT = 10 CALL DGELS( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) CALL CHKXER( 'DGELS ', INFOT, NOUT, LERR, OK ) * +* DGELST +* + SRNAMT = 'DGELST' + INFOT = 1 + CALL DGELST( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGELST( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGELST( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DGELST( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DGELST( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGELST( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGELST( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DGELST( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGELST', INFOT, NOUT, LERR, OK ) +* +* DGETSLS +* + SRNAMT = 'DGETSLS' + INFOT = 1 + CALL DGETSLS( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGETSLS( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGETSLS( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DGETSLS( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DGETSLS( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGETSLS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGETSLS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'DGETSLS', INFOT, NOUT, LERR, OK ) +* * DGELSS * SRNAMT = 'DGELSS' diff --git 
a/lapack-netlib/TESTING/LIN/sdrvls.f b/lapack-netlib/TESTING/LIN/sdrvls.f index b96451503..2baf9a3fb 100644 --- a/lapack-netlib/TESTING/LIN/sdrvls.f +++ b/lapack-netlib/TESTING/LIN/sdrvls.f @@ -31,8 +31,8 @@ *> *> \verbatim *> -*> SDRVLS tests the least squares driver routines SGELS, SGETSLS, SGELSS, SGELSY, -*> and SGELSD. +*> SDRVLS tests the least squares driver routines SGELS, SGELST, +*> SGETSLS, SGELSS, SGELSY and SGELSD. *> \endverbatim * * Arguments: @@ -211,7 +211,7 @@ * * .. Parameters .. INTEGER NTESTS - PARAMETER ( NTESTS = 16 ) + PARAMETER ( NTESTS = 18 ) INTEGER SMLSIZ PARAMETER ( SMLSIZ = 25 ) REAL ONE, TWO, ZERO @@ -225,8 +225,8 @@ $ LWLSY, LWORK, M, MNMIN, N, NB, NCOLS, NERRS, $ NFAIL, NRHS, NROWS, NRUN, RANK, MB, $ MMAX, NMAX, NSMAX, LIWORK, - $ LWORK_SGELS, LWORK_SGETSLS, LWORK_SGELSS, - $ LWORK_SGELSY, LWORK_SGELSD + $ LWORK_SGELS, LWORK_SGELST, LWORK_SGETSLS, + $ LWORK_SGELSS, LWORK_SGELSY, LWORK_SGELSD REAL EPS, NORMA, NORMB, RCOND * .. * .. Local Arrays .. @@ -243,12 +243,12 @@ * .. * .. External Subroutines .. EXTERNAL ALAERH, ALAHD, ALASVM, SAXPY, SERRLS, SGELS, - $ SGELSD, SGELSS, SGELSY, SGEMM, SLACPY, - $ SLARNV, SQRT13, SQRT15, SQRT16, SSCAL, - $ XLAENV, SGETSLS + $ SGELSD, SGELSS, SGELST, SGELSY, SGEMM, + $ SGETSLS, SLACPY, SLARNV, SQRT13, SQRT15, + $ SQRT16, SSCAL, XLAENV * .. * .. Intrinsic Functions .. - INTRINSIC INT, LOG, MAX, MIN, REAL, SQRT + INTRINSIC INT, MAX, MIN, REAL, SQRT * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -330,7 +330,8 @@ LIWORK = 1 * * Iterate through all test cases and compute necessary workspace -* sizes for ?GELS, ?GETSLS, ?GELSY, ?GELSS and ?GELSD routines. +* sizes for ?GELS, ?GELST, ?GETSLS, ?GELSY, ?GELSS and ?GELSD +* routines. * DO IM = 1, NM M = MVAL( IM ) @@ -357,6 +358,10 @@ CALL SGELS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ( 1 ), -1, INFO ) LWORK_SGELS = INT ( WQ( 1 ) ) +* Compute workspace needed for SGELST + CALL SGELST( TRANS, M, N, NRHS, A, LDA, + $ B, LDB, WQ, -1, INFO ) + LWORK_SGELST = INT ( WQ ( 1 ) ) * Compute workspace needed for SGETSLS CALL SGETSLS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ( 1 ), -1, INFO ) @@ -378,9 +383,9 @@ * Compute LIWORK workspace needed for SGELSY and SGELSD LIWORK = MAX( LIWORK, N, IWQ( 1 ) ) * Compute LWORK workspace needed for all functions - LWORK = MAX( LWORK, LWORK_SGELS, LWORK_SGETSLS, - $ LWORK_SGELSY, LWORK_SGELSS, - $ LWORK_SGELSD ) + LWORK = MAX( LWORK, LWORK_SGELS, LWORK_SGELST, + $ LWORK_SGETSLS, LWORK_SGELSY, + $ LWORK_SGELSS, LWORK_SGELSD ) END IF ENDDO ENDDO @@ -411,21 +416,26 @@ ITYPE = ( IRANK-1 )*3 + ISCALE IF( .NOT.DOTYPE( ITYPE ) ) $ GO TO 110 -* +* ===================================================== +* Begin test SGELS +* ===================================================== IF( IRANK.EQ.1 ) THEN * -* Test SGELS -* * Generate a matrix of scaling type ISCALE * CALL SQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 40 INB = 1, NNB +* +* Loop for testing different block sizes. +* + DO INB = 1, NNB NB = NBVAL( INB ) CALL XLAENV( 1, NB ) CALL XLAENV( 3, NXVAL( INB ) ) * - DO 30 ITRAN = 1, 2 +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -469,20 +479,27 @@ $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 1: Check correctness of results +* for SGELS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. 
NRHS.GT.0 ) $ CALL SLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL SQRT16( TRANS, M, N, NRHS, COPYA, $ LDA, B, LDB, C, LDB, WORK, $ RESULT( 1 ) ) +* +* Test 2: Check correctness of results +* for SGELS. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN * -* Solving LS system +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) * RESULT( 2 ) = SQRT17( TRANS, 1, M, N, $ NRHS, COPYA, LDA, B, LDB, @@ -500,7 +517,7 @@ * Print information about the tests that * did not pass the threshold. * - DO 20 K = 1, 2 + DO K = 1, 2 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) @@ -509,26 +526,33 @@ $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 20 CONTINUE + END DO NRUN = NRUN + 2 - 30 CONTINUE - 40 CONTINUE -* -* -* Test SGETSLS + END DO + END DO + END IF +* ===================================================== +* End test SGELS +* ===================================================== +* ===================================================== +* Begin test SGELST +* ===================================================== + IF( IRANK.EQ.1 ) THEN * * Generate a matrix of scaling type ISCALE * CALL SQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 65 INB = 1, NNB - MB = NBVAL( INB ) - CALL XLAENV( 1, MB ) - DO 62 IMB = 1, NNB - NB = NBVAL( IMB ) - CALL XLAENV( 2, NB ) -* - DO 60 ITRAN = 1, 2 +* +* Loop for testing different block sizes. +* + DO INB = 1, NNB + NB = NBVAL( INB ) + CALL XLAENV( 1, NB ) +* +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -563,31 +587,38 @@ CALL SLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, B, LDB ) END IF - SRNAMT = 'SGETSLS ' - CALL SGETSLS( TRANS, M, N, NRHS, A, - $ LDA, B, LDB, WORK, LWORK, INFO ) + SRNAMT = 'SGELST' + CALL SGELST( TRANS, M, N, NRHS, A, LDA, B, + $ LDB, WORK, LWORK, INFO ) IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'SGETSLS ', INFO, 0, + $ CALL ALAERH( PATH, 'SGELST', INFO, 0, $ TRANS, M, N, NRHS, -1, NB, $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 3: Check correctness of results +* for SGELST, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL SLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL SQRT16( TRANS, M, N, NRHS, COPYA, $ LDA, B, LDB, C, LDB, WORK, - $ RESULT( 15 ) ) + $ RESULT( 3 ) ) +* +* Test 4: Check correctness of results +* for SGELST. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN * -* Solving LS system +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) * - RESULT( 16 ) = SQRT17( TRANS, 1, M, N, + RESULT( 4 ) = SQRT17( TRANS, 1, M, N, $ NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, $ LWORK ) @@ -595,7 +626,7 @@ * * Solving overdetermined system * - RESULT( 16 ) = SQRT14( TRANS, M, N, + RESULT( 4 ) = SQRT14( TRANS, M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) END IF @@ -603,21 +634,151 @@ * Print information about the tests that * did not pass the threshold. * - DO 50 K = 15, 16 + DO K = 3, 4 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) - WRITE( NOUT, FMT = 9997 )TRANS, M, - $ N, NRHS, MB, NB, ITYPE, K, + WRITE( NOUT, FMT = 9999 ) TRANS, M, + $ N, NRHS, NB, ITYPE, K, $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 50 CONTINUE + END DO NRUN = NRUN + 2 - 60 CONTINUE - 62 CONTINUE - 65 CONTINUE + END DO + END DO END IF +* ===================================================== +* End test SGELST +* ===================================================== +* ===================================================== +* Begin test SGETSLS +* ===================================================== + IF( IRANK.EQ.1 ) THEN +* +* Generate a matrix of scaling type ISCALE +* + CALL SQRT13( ISCALE, M, N, COPYA, LDA, NORMA, + $ ISEED ) +* +* Loop for testing different block sizes MB. +* + DO IMB = 1, NNB + MB = NBVAL( IMB ) + CALL XLAENV( 1, MB ) +* +* Loop for testing different block sizes NB. +* + DO INB = 1, NNB + NB = NBVAL( INB ) + CALL XLAENV( 2, NB ) +* +* Loop for testing non-transposed +* and transposed. +* + DO ITRAN = 1, 2 + IF( ITRAN.EQ.1 ) THEN + TRANS = 'N' + NROWS = M + NCOLS = N + ELSE + TRANS = 'T' + NROWS = N + NCOLS = M + END IF + LDWORK = MAX( 1, NCOLS ) +* +* Set up a consistent rhs +* + IF( NCOLS.GT.0 ) THEN + CALL SLARNV( 2, ISEED, NCOLS*NRHS, + $ WORK ) + CALL SSCAL( NCOLS*NRHS, + $ ONE / REAL( NCOLS ), + $ WORK, 1 ) + END IF + CALL SGEMM( TRANS, 'No transpose', + $ NROWS, NRHS, NCOLS, ONE, + $ COPYA, LDA, WORK, LDWORK, + $ ZERO, B, LDB ) + CALL SLACPY( 'Full', NROWS, NRHS, + $ B, LDB, COPYB, LDB ) +* +* Solve LS or overdetermined system +* + IF( M.GT.0 .AND. N.GT.0 ) THEN + CALL SLACPY( 'Full', M, N, + $ COPYA, LDA, A, LDA ) + CALL SLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, B, LDB ) + END IF + SRNAMT = 'SGETSLS' + CALL SGETSLS( TRANS, M, N, NRHS, + $ A, LDA, B, LDB, WORK, LWORK, + $ INFO ) + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'SGETSLS', INFO, + $ 0, TRANS, M, N, NRHS, + $ -1, NB, ITYPE, NFAIL, + $ NERRS, NOUT ) +* +* Test 5: Check correctness of results +* for SGETSLS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) +* + IF( NROWS.GT.0 .AND. NRHS.GT.0 ) + $ CALL SLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, C, LDB ) + CALL SQRT16( TRANS, M, N, NRHS, + $ COPYA, LDA, B, LDB, + $ C, LDB, WORK, + $ RESULT( 5 ) ) +* +* Test 6: Check correctness of results +* for SGETSLS. +* + IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. + $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN +* +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) +* + RESULT( 6 ) = SQRT17( TRANS, 1, M, + $ N, NRHS, COPYA, LDA, + $ B, LDB, COPYB, LDB, + $ C, WORK, LWORK ) + ELSE +* +* Solving overdetermined system +* + RESULT( 6 ) = SQRT14( TRANS, M, N, + $ NRHS, COPYA, LDA, + $ B, LDB, WORK, LWORK ) + END IF +* +* Print information about the tests that +* did not pass the threshold. +* + DO K = 5, 6 + IF( RESULT( K ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9997 ) TRANS, + $ M, N, NRHS, MB, NB, ITYPE, + $ K, RESULT( K ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 2 + END DO + END DO + END DO + END IF +* ===================================================== +* End test SGETSLS +* ===================================================== * * Generate a matrix of scaling type ISCALE and rank * type IRANK. 
@@ -662,37 +823,37 @@ $ N, NRHS, -1, NB, ITYPE, NFAIL, $ NERRS, NOUT ) * -* Test 3: Compute relative error in svd +* Test 7: Compute relative error in svd * workspace: M*N + 4*MIN(M,N) + MAX(M,N) * - RESULT( 3 ) = SQRT12( CRANK, CRANK, A, LDA, + RESULT( 7 ) = SQRT12( CRANK, CRANK, A, LDA, $ COPYS, WORK, LWORK ) * -* Test 4: Compute error in solution +* Test 8: Compute error in solution * workspace: M*NRHS + M * CALL SLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL SQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, - $ WORK( M*NRHS+1 ), RESULT( 4 ) ) + $ WORK( M*NRHS+1 ), RESULT( 8 ) ) * -* Test 5: Check norm of r'*A +* Test 9: Check norm of r'*A * workspace: NRHS*(M+N) * - RESULT( 5 ) = ZERO + RESULT( 9 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 5 ) = SQRT17( 'No transpose', 1, M, + $ RESULT( 9 ) = SQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 6: Check if x is in the rowspace of A +* Test 10: Check if x is in the rowspace of A * workspace: (M+NRHS)*(N+2) * - RESULT( 6 ) = ZERO + RESULT( 10 ) = ZERO * IF( N.GT.CRANK ) - $ RESULT( 6 ) = SQRT14( 'No transpose', M, N, + $ RESULT( 10 ) = SQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -716,38 +877,38 @@ * workspace used: 3*min(m,n) + * max(2*min(m,n),nrhs,max(m,n)) * -* Test 7: Compute relative error in svd +* Test 11: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL SAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 7 ) = SASUM( MNMIN, S, 1 ) / + RESULT( 11 ) = SASUM( MNMIN, S, 1 ) / $ SASUM( MNMIN, COPYS, 1 ) / $ ( EPS*REAL( MNMIN ) ) ELSE - RESULT( 7 ) = ZERO + RESULT( 11 ) = ZERO END IF * -* Test 8: Compute error in solution +* Test 12: Compute error in solution * CALL SLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL SQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, - $ WORK( M*NRHS+1 ), RESULT( 8 ) ) + $ WORK( M*NRHS+1 ), RESULT( 12 ) ) * -* Test 9: Check norm of r'*A +* Test 13: Check norm of r'*A * - RESULT( 9 ) = ZERO + RESULT( 13 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 9 ) = SQRT17( 'No transpose', 1, M, + $ RESULT( 13 ) = SQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 10: Check if x is in the rowspace of A +* Test 14: Check if x is in the rowspace of A * - RESULT( 10 ) = ZERO + RESULT( 14 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 10 ) = SQRT14( 'No transpose', M, N, + $ RESULT( 14 ) = SQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -776,45 +937,45 @@ $ N, NRHS, -1, NB, ITYPE, NFAIL, $ NERRS, NOUT ) * -* Test 11: Compute relative error in svd +* Test 15: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL SAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 11 ) = SASUM( MNMIN, S, 1 ) / + RESULT( 15 ) = SASUM( MNMIN, S, 1 ) / $ SASUM( MNMIN, COPYS, 1 ) / $ ( EPS*REAL( MNMIN ) ) ELSE - RESULT( 11 ) = ZERO + RESULT( 15 ) = ZERO END IF * -* Test 12: Compute error in solution +* Test 16: Compute error in solution * CALL SLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL SQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, - $ WORK( M*NRHS+1 ), RESULT( 12 ) ) + $ WORK( M*NRHS+1 ), RESULT( 16 ) ) * -* Test 13: Check norm of r'*A +* Test 17: Check norm of r'*A * - RESULT( 13 ) = ZERO + RESULT( 17 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 13 ) = SQRT17( 'No transpose', 1, M, + $ RESULT( 17 ) = SQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 
14: Check if x is in the rowspace of A +* Test 18: Check if x is in the rowspace of A * - RESULT( 14 ) = ZERO + RESULT( 18 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 14 ) = SQRT14( 'No transpose', M, N, + $ RESULT( 18 ) = SQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * * Print information about the tests that did not * pass the threshold. * - DO 90 K = 3, 14 + DO 90 K = 7, 18 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) diff --git a/lapack-netlib/TESTING/LIN/serrls.f b/lapack-netlib/TESTING/LIN/serrls.f index e6ee4360f..6c4820066 100644 --- a/lapack-netlib/TESTING/LIN/serrls.f +++ b/lapack-netlib/TESTING/LIN/serrls.f @@ -22,7 +22,7 @@ *> \verbatim *> *> SERRLS tests the error exits for the REAL least squares -*> driver routines (SGELS, SGELSS, SGELSY, SGELSD). +*> driver routines (SGELS, SGELST, SGETSLS, SGELSS, SGELSY, SGELSD). *> \endverbatim * * Arguments: @@ -83,7 +83,8 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, SGELS, SGELSD, SGELSS, SGELSY + EXTERNAL ALAESM, CHKXER, SGELS, SGELSD, SGELSS, SGELST, + $ SGELSY, SGETSLS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -130,10 +131,66 @@ INFOT = 8 CALL SGELS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) CALL CHKXER( 'SGELS ', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGELS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'DGELS', INFOT, NOUT, LERR, OK ) INFOT = 10 CALL SGELS( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) CALL CHKXER( 'SGELS ', INFOT, NOUT, LERR, OK ) * +* SGELST +* + SRNAMT = 'SGELST' + INFOT = 1 + CALL SGELST( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGELST( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGELST( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SGELST( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL SGELST( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGELST( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGELST( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SGELST( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGELST', INFOT, NOUT, LERR, OK ) +* +* SGETSLS +* + SRNAMT = 'SGETSLS' + INFOT = 1 + CALL SGETSLS( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGETSLS( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGETSLS( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SGETSLS( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL SGETSLS( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGETSLS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGETSLS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'SGETSLS', INFOT, NOUT, LERR, OK ) +* * SGELSS * SRNAMT = 'SGELSS' diff --git a/lapack-netlib/TESTING/LIN/zdrvls.f b/lapack-netlib/TESTING/LIN/zdrvls.f index 2eab97905..b21345d30 100644 --- 
a/lapack-netlib/TESTING/LIN/zdrvls.f +++ b/lapack-netlib/TESTING/LIN/zdrvls.f @@ -31,8 +31,8 @@ *> *> \verbatim *> -*> ZDRVLS tests the least squares driver routines ZGELS, ZGETSLS, ZGELSS, ZGELSY -*> and ZGELSD. +*> ZDRVLS tests the least squares driver routines ZGELS, ZGELST, +*> ZGETSLS, ZGELSS, ZGELSY and ZGELSD. *> \endverbatim * * Arguments: @@ -211,7 +211,7 @@ * * .. Parameters .. INTEGER NTESTS - PARAMETER ( NTESTS = 16 ) + PARAMETER ( NTESTS = 18 ) INTEGER SMLSIZ PARAMETER ( SMLSIZ = 25 ) DOUBLE PRECISION ONE, ZERO @@ -228,8 +228,8 @@ $ LWLSY, LWORK, M, MNMIN, N, NB, NCOLS, NERRS, $ NFAIL, NRHS, NROWS, NRUN, RANK, MB, $ MMAX, NMAX, NSMAX, LIWORK, LRWORK, - $ LWORK_ZGELS, LWORK_ZGETSLS, LWORK_ZGELSS, - $ LWORK_ZGELSY, LWORK_ZGELSD, + $ LWORK_ZGELS, LWORK_ZGELST, LWORK_ZGETSLS, + $ LWORK_ZGELSS, LWORK_ZGELSY, LWORK_ZGELSD, $ LRWORK_ZGELSY, LRWORK_ZGELSS, LRWORK_ZGELSD DOUBLE PRECISION EPS, NORMA, NORMB, RCOND * .. @@ -248,10 +248,10 @@ EXTERNAL DASUM, DLAMCH, ZQRT12, ZQRT14, ZQRT17 * .. * .. External Subroutines .. - EXTERNAL ALAERH, ALAHD, ALASVM, DAXPY, DLASRT, XLAENV, - $ ZDSCAL, ZERRLS, ZGELS, ZGELSD, ZGELSS, - $ ZGELSY, ZGEMM, ZLACPY, ZLARNV, ZQRT13, ZQRT15, - $ ZQRT16, ZGETSLS + EXTERNAL ALAERH, ALAHD, ALASVM, DAXPY, ZERRLS, ZGELS, + $ ZGELSD, ZGELSS, ZGELST, ZGELSY, ZGEMM, + $ ZGETSLS, ZLACPY, ZLARNV, ZQRT13, ZQRT15, + $ ZQRT16, ZDSCAL, XLAENV * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN, INT, SQRT @@ -334,7 +334,8 @@ LIWORK = 1 * * Iterate through all test cases and compute necessary workspace -* sizes for ?GELS, ?GETSLS, ?GELSY, ?GELSS and ?GELSD routines. +* sizes for ?GELS, ?GELST, ?GETSLS, ?GELSY, ?GELSS and ?GELSD +* routines. * DO IM = 1, NM M = MVAL( IM ) @@ -361,6 +362,10 @@ CALL ZGELS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) LWORK_ZGELS = INT ( WQ( 1 ) ) +* Compute workspace needed for ZGELST + CALL ZGELST( TRANS, M, N, NRHS, A, LDA, + $ B, LDB, WQ, -1, INFO ) + LWORK_ZGELST = INT ( WQ ( 1 ) ) * Compute workspace needed for ZGETSLS CALL ZGETSLS( TRANS, M, N, NRHS, A, LDA, $ B, LDB, WQ, -1, INFO ) @@ -390,9 +395,9 @@ LRWORK = MAX( LRWORK, LRWORK_ZGELSY, $ LRWORK_ZGELSS, LRWORK_ZGELSD ) * Compute LWORK workspace needed for all functions - LWORK = MAX( LWORK, LWORK_ZGELS, LWORK_ZGETSLS, - $ LWORK_ZGELSY, LWORK_ZGELSS, - $ LWORK_ZGELSD ) + LWORK = MAX( LWORK, LWORK_ZGELS, LWORK_ZGELST, + $ LWORK_ZGETSLS, LWORK_ZGELSY, + $ LWORK_ZGELSS, LWORK_ZGELSD ) END IF ENDDO ENDDO @@ -425,21 +430,26 @@ ITYPE = ( IRANK-1 )*3 + ISCALE IF( .NOT.DOTYPE( ITYPE ) ) $ GO TO 100 -* +* ===================================================== +* Begin test ZGELS +* ===================================================== IF( IRANK.EQ.1 ) THEN * -* Test ZGELS -* * Generate a matrix of scaling type ISCALE * CALL ZQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 40 INB = 1, NNB +* +* Loop for testing different block sizes. +* + DO INB = 1, NNB NB = NBVAL( INB ) CALL XLAENV( 1, NB ) CALL XLAENV( 3, NXVAL( INB ) ) * - DO 30 ITRAN = 1, 2 +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -484,15 +494,20 @@ $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 1: Check correctness of results +* for ZGELS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. 
NRHS.GT.0 ) $ CALL ZLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL ZQRT16( TRANS, M, N, NRHS, COPYA, $ LDA, B, LDB, C, LDB, RWORK, $ RESULT( 1 ) ) +* +* Test 2: Check correctness of results +* for ZGELS. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN @@ -515,7 +530,7 @@ * Print information about the tests that * did not pass the threshold. * - DO 20 K = 1, 2 + DO K = 1, 2 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) @@ -524,26 +539,34 @@ $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 20 CONTINUE + END DO NRUN = NRUN + 2 - 30 CONTINUE - 40 CONTINUE -* -* -* Test ZGETSLS + END DO + END DO + END IF +* ===================================================== +* End test ZGELS +* ===================================================== +* ===================================================== +* Begin test ZGELST +* ===================================================== + IF( IRANK.EQ.1 ) THEN * * Generate a matrix of scaling type ISCALE * CALL ZQRT13( ISCALE, M, N, COPYA, LDA, NORMA, $ ISEED ) - DO 65 INB = 1, NNB - MB = NBVAL( INB ) - CALL XLAENV( 1, MB ) - DO 62 IMB = 1, NNB - NB = NBVAL( IMB ) - CALL XLAENV( 2, NB ) * - DO 60 ITRAN = 1, 2 +* Loop for testing different block sizes. +* + DO INB = 1, NNB + NB = NBVAL( INB ) + CALL XLAENV( 1, NB ) + CALL XLAENV( 3, NXVAL( INB ) ) +* +* Loop for testing non-transposed and transposed. +* + DO ITRAN = 1, 2 IF( ITRAN.EQ.1 ) THEN TRANS = 'N' NROWS = M @@ -560,9 +583,9 @@ IF( NCOLS.GT.0 ) THEN CALL ZLARNV( 2, ISEED, NCOLS*NRHS, $ WORK ) - CALL ZSCAL( NCOLS*NRHS, - $ CONE / DBLE( NCOLS ), WORK, - $ 1 ) + CALL ZDSCAL( NCOLS*NRHS, + $ ONE / DBLE( NCOLS ), WORK, + $ 1 ) END IF CALL ZGEMM( TRANS, 'No transpose', NROWS, $ NRHS, NCOLS, CONE, COPYA, LDA, @@ -578,31 +601,37 @@ CALL ZLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, B, LDB ) END IF - SRNAMT = 'ZGETSLS ' - CALL ZGETSLS( TRANS, M, N, NRHS, A, - $ LDA, B, LDB, WORK, LWORK, INFO ) + SRNAMT = 'ZGELST' + CALL ZGELST( TRANS, M, N, NRHS, A, LDA, B, + $ LDB, WORK, LWORK, INFO ) +* IF( INFO.NE.0 ) - $ CALL ALAERH( PATH, 'ZGETSLS ', INFO, 0, + $ CALL ALAERH( PATH, 'ZGELST', INFO, 0, $ TRANS, M, N, NRHS, -1, NB, $ ITYPE, NFAIL, NERRS, $ NOUT ) * -* Check correctness of results +* Test 3: Check correctness of results +* for ZGELST, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) * - LDWORK = MAX( 1, NROWS ) IF( NROWS.GT.0 .AND. NRHS.GT.0 ) $ CALL ZLACPY( 'Full', NROWS, NRHS, $ COPYB, LDB, C, LDB ) CALL ZQRT16( TRANS, M, N, NRHS, COPYA, - $ LDA, B, LDB, C, LDB, WORK2, - $ RESULT( 15 ) ) + $ LDA, B, LDB, C, LDB, RWORK, + $ RESULT( 3 ) ) +* +* Test 4: Check correctness of results +* for ZGELST. * IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN * * Solving LS system * - RESULT( 16 ) = ZQRT17( TRANS, 1, M, N, + RESULT( 4 ) = ZQRT17( TRANS, 1, M, N, $ NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, $ LWORK ) @@ -610,7 +639,7 @@ * * Solving overdetermined system * - RESULT( 16 ) = ZQRT14( TRANS, M, N, + RESULT( 4 ) = ZQRT14( TRANS, M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) END IF @@ -618,21 +647,151 @@ * Print information about the tests that * did not pass the threshold. * - DO 50 K = 15, 16 + DO K = 3, 4 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) - WRITE( NOUT, FMT = 9997 )TRANS, M, - $ N, NRHS, MB, NB, ITYPE, K, + WRITE( NOUT, FMT = 9999 )TRANS, M, + $ N, NRHS, NB, ITYPE, K, $ RESULT( K ) NFAIL = NFAIL + 1 END IF - 50 CONTINUE + END DO NRUN = NRUN + 2 - 60 CONTINUE - 62 CONTINUE - 65 CONTINUE + END DO + END DO + END IF +* ===================================================== +* End test ZGELST +* ===================================================== +* ===================================================== +* Begin test ZGELSTSLS +* ===================================================== + IF( IRANK.EQ.1 ) THEN +* +* Generate a matrix of scaling type ISCALE +* + CALL ZQRT13( ISCALE, M, N, COPYA, LDA, NORMA, + $ ISEED ) +* +* Loop for testing different block sizes MB. +* + DO INB = 1, NNB + MB = NBVAL( INB ) + CALL XLAENV( 1, MB ) +* +* Loop for testing different block sizes NB. +* + DO IMB = 1, NNB + NB = NBVAL( IMB ) + CALL XLAENV( 2, NB ) +* +* Loop for testing non-transposed +* and transposed. +* + DO ITRAN = 1, 2 + IF( ITRAN.EQ.1 ) THEN + TRANS = 'N' + NROWS = M + NCOLS = N + ELSE + TRANS = 'C' + NROWS = N + NCOLS = M + END IF + LDWORK = MAX( 1, NCOLS ) +* +* Set up a consistent rhs +* + IF( NCOLS.GT.0 ) THEN + CALL ZLARNV( 2, ISEED, NCOLS*NRHS, + $ WORK ) + CALL ZSCAL( NCOLS*NRHS, + $ CONE / DBLE( NCOLS ), + $ WORK, 1 ) + END IF + CALL ZGEMM( TRANS, 'No transpose', + $ NROWS, NRHS, NCOLS, CONE, + $ COPYA, LDA, WORK, LDWORK, + $ CZERO, B, LDB ) + CALL ZLACPY( 'Full', NROWS, NRHS, + $ B, LDB, COPYB, LDB ) +* +* Solve LS or overdetermined system +* + IF( M.GT.0 .AND. N.GT.0 ) THEN + CALL ZLACPY( 'Full', M, N, + $ COPYA, LDA, A, LDA ) + CALL ZLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, B, LDB ) + END IF + SRNAMT = 'ZGETSLS ' + CALL ZGETSLS( TRANS, M, N, NRHS, A, + $ LDA, B, LDB, WORK, LWORK, + $ INFO ) + IF( INFO.NE.0 ) + $ CALL ALAERH( PATH, 'ZGETSLS ', INFO, + $ 0, TRANS, M, N, NRHS, + $ -1, NB, ITYPE, NFAIL, + $ NERRS, NOUT ) +* +* Test 5: Check correctness of results +* for ZGETSLS, compute the residual: +* RESID = norm(B - A*X) / +* / ( max(m,n) * norm(A) * norm(X) * EPS ) +* + IF( NROWS.GT.0 .AND. NRHS.GT.0 ) + $ CALL ZLACPY( 'Full', NROWS, NRHS, + $ COPYB, LDB, C, LDB ) + CALL ZQRT16( TRANS, M, N, NRHS, + $ COPYA, LDA, B, LDB, + $ C, LDB, WORK2, + $ RESULT( 5 ) ) +* +* Test 6: Check correctness of results +* for ZGETSLS. +* + IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR. + $ ( ITRAN.EQ.2 .AND. M.LT.N ) ) THEN +* +* Solving LS system, compute: +* r = norm((B- A*X)**T * A) / +* / (norm(A)*norm(B)*max(M,N,NRHS)*EPS) +* + RESULT( 6 ) = ZQRT17( TRANS, 1, M, + $ N, NRHS, COPYA, LDA, + $ B, LDB, COPYB, LDB, + $ C, WORK, LWORK ) + ELSE +* +* Solving overdetermined system +* + RESULT( 6 ) = ZQRT14( TRANS, M, N, + $ NRHS, COPYA, LDA, B, + $ LDB, WORK, LWORK ) + END IF +* +* Print information about the tests that +* did not pass the threshold. +* + DO K = 5, 6 + IF( RESULT( K ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9997 )TRANS, + $ M, N, NRHS, MB, NB, ITYPE, K, + $ RESULT( K ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + 2 + END DO + END DO + END DO END IF +* ===================================================== +* End test ZGELSTSLS +* ===================================================== * * Generate a matrix of scaling type ISCALE and rank * type IRANK. 
@@ -680,37 +839,37 @@ * * workspace used: 2*MNMIN+NB*NB+NB*MAX(N,NRHS) * -* Test 3: Compute relative error in svd +* Test 7: Compute relative error in svd * workspace: M*N + 4*MIN(M,N) + MAX(M,N) * - RESULT( 3 ) = ZQRT12( CRANK, CRANK, A, LDA, + RESULT( 7 ) = ZQRT12( CRANK, CRANK, A, LDA, $ COPYS, WORK, LWORK, RWORK ) * -* Test 4: Compute error in solution +* Test 8: Compute error in solution * workspace: M*NRHS + M * CALL ZLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL ZQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, RWORK, - $ RESULT( 4 ) ) + $ RESULT( 8 ) ) * -* Test 5: Check norm of r'*A +* Test 9: Check norm of r'*A * workspace: NRHS*(M+N) * - RESULT( 5 ) = ZERO + RESULT( 9 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 5 ) = ZQRT17( 'No transpose', 1, M, + $ RESULT( 9 ) = ZQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 6: Check if x is in the rowspace of A +* Test 10: Check if x is in the rowspace of A * workspace: (M+NRHS)*(N+2) * - RESULT( 6 ) = ZERO + RESULT( 10 ) = ZERO * IF( N.GT.CRANK ) - $ RESULT( 6 ) = ZQRT14( 'No transpose', M, N, + $ RESULT( 10 ) = ZQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -736,38 +895,38 @@ * workspace used: 3*min(m,n) + * max(2*min(m,n),nrhs,max(m,n)) * -* Test 7: Compute relative error in svd +* Test 11: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL DAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 7 ) = DASUM( MNMIN, S, 1 ) / + RESULT( 11 ) = DASUM( MNMIN, S, 1 ) / $ DASUM( MNMIN, COPYS, 1 ) / $ ( EPS*DBLE( MNMIN ) ) ELSE - RESULT( 7 ) = ZERO + RESULT( 11 ) = ZERO END IF * -* Test 8: Compute error in solution +* Test 12: Compute error in solution * CALL ZLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL ZQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, RWORK, - $ RESULT( 8 ) ) + $ RESULT( 12 ) ) * -* Test 9: Check norm of r'*A +* Test 13: Check norm of r'*A * - RESULT( 9 ) = ZERO + RESULT( 13 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 9 ) = ZQRT17( 'No transpose', 1, M, + $ RESULT( 13 ) = ZQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 10: Check if x is in the rowspace of A +* Test 14: Check if x is in the rowspace of A * - RESULT( 10 ) = ZERO + RESULT( 14 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 10 ) = ZQRT14( 'No transpose', M, N, + $ RESULT( 14 ) = ZQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * @@ -792,45 +951,45 @@ $ N, NRHS, -1, NB, ITYPE, NFAIL, $ NERRS, NOUT ) * -* Test 11: Compute relative error in svd +* Test 15: Compute relative error in svd * IF( RANK.GT.0 ) THEN CALL DAXPY( MNMIN, -ONE, COPYS, 1, S, 1 ) - RESULT( 11 ) = DASUM( MNMIN, S, 1 ) / + RESULT( 15 ) = DASUM( MNMIN, S, 1 ) / $ DASUM( MNMIN, COPYS, 1 ) / $ ( EPS*DBLE( MNMIN ) ) ELSE - RESULT( 11 ) = ZERO + RESULT( 15 ) = ZERO END IF * -* Test 12: Compute error in solution +* Test 16: Compute error in solution * CALL ZLACPY( 'Full', M, NRHS, COPYB, LDB, WORK, $ LDWORK ) CALL ZQRT16( 'No transpose', M, N, NRHS, COPYA, $ LDA, B, LDB, WORK, LDWORK, RWORK, - $ RESULT( 12 ) ) + $ RESULT( 16 ) ) * -* Test 13: Check norm of r'*A +* Test 17: Check norm of r'*A * - RESULT( 13 ) = ZERO + RESULT( 17 ) = ZERO IF( M.GT.CRANK ) - $ RESULT( 13 ) = ZQRT17( 'No transpose', 1, M, + $ RESULT( 17 ) = ZQRT17( 'No transpose', 1, M, $ N, NRHS, COPYA, LDA, B, LDB, $ COPYB, LDB, C, WORK, LWORK ) * -* Test 14: Check if x is in the rowspace of A +* Test 18: Check if x is in the rowspace 
of A * - RESULT( 14 ) = ZERO + RESULT( 18 ) = ZERO IF( N.GT.CRANK ) - $ RESULT( 14 ) = ZQRT14( 'No transpose', M, N, + $ RESULT( 18 ) = ZQRT14( 'No transpose', M, N, $ NRHS, COPYA, LDA, B, LDB, $ WORK, LWORK ) * * Print information about the tests that did not * pass the threshold. * - DO 80 K = 3, 14 + DO 80 K = 7, 18 IF( RESULT( K ).GE.THRESH ) THEN IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) $ CALL ALAHD( NOUT, PATH ) diff --git a/lapack-netlib/TESTING/LIN/zerrls.f b/lapack-netlib/TESTING/LIN/zerrls.f index 66e56c8c6..22f049ee0 100644 --- a/lapack-netlib/TESTING/LIN/zerrls.f +++ b/lapack-netlib/TESTING/LIN/zerrls.f @@ -22,7 +22,7 @@ *> \verbatim *> *> ZERRLS tests the error exits for the COMPLEX*16 least squares -*> driver routines (ZGELS, CGELSS, CGELSY, CGELSD). +*> driver routines (ZGELS, ZGELST, ZGETSLS, CGELSS, CGELSY, CGELSD). *> \endverbatim * * Arguments: @@ -83,7 +83,8 @@ EXTERNAL LSAMEN * .. * .. External Subroutines .. - EXTERNAL ALAESM, CHKXER, ZGELS, ZGELSD, ZGELSS, ZGELSY + EXTERNAL ALAESM, CHKXER, ZGELS, ZGELSD, ZGELSS, ZGELST, + $ ZGELSY, ZGETSLS * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -130,10 +131,66 @@ INFOT = 8 CALL ZGELS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) CALL CHKXER( 'ZGELS ', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGELS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'ZGELS', INFOT, NOUT, LERR, OK ) INFOT = 10 CALL ZGELS( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) CALL CHKXER( 'ZGELS ', INFOT, NOUT, LERR, OK ) * +* ZGELST +* + SRNAMT = 'ZGELST' + INFOT = 1 + CALL ZGELST( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGELST( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGELST( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGELST( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZGELST( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGELST( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGELST( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGELST( 'N', 1, 1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGELST', INFOT, NOUT, LERR, OK ) +* +* ZGETSLS +* + SRNAMT = 'ZGETSLS' + INFOT = 1 + CALL ZGETSLS( '/', 0, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGETSLS( 'N', -1, 0, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGETSLS( 'N', 0, -1, 0, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGETSLS( 'N', 0, 0, -1, A, 1, B, 1, W, 1, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZGETSLS( 'N', 2, 0, 0, A, 1, B, 2, W, 2, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGETSLS( 'N', 2, 0, 0, A, 2, B, 1, W, 2, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGETSLS( 'N', 0, 2, 0, A, 1, B, 1, W, 2, INFO ) + CALL CHKXER( 'ZGETSLS', INFOT, NOUT, LERR, OK ) +* * ZGELSS * SRNAMT = 'ZGELSS' From 1d32ce51359145d94eb6d592f8f5d43437f1a9f0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Nov 2022 22:42:50 +0100 Subject: [PATCH 116/154] Add ?GELST (Reference-LAPACK PR739) --- lapack-netlib/SRC/Makefile | 8 
++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 49eb69cfe..49798b0c5 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -207,7 +207,7 @@ SLASRC_O = \ ssytrd_2stage.o ssytrd_sy2sb.o ssytrd_sb2st.o ssb2st_kernels.o \ ssyevd_2stage.o ssyev_2stage.o ssyevx_2stage.o ssyevr_2stage.o \ ssbev_2stage.o ssbevx_2stage.o ssbevd_2stage.o ssygv_2stage.o \ - sgesvdq.o slarmm.o slatrs3.o strsyl3.o + sgesvdq.o slarmm.o slatrs3.o strsyl3.o sgelst.o endif @@ -316,7 +316,7 @@ CLASRC_O = \ chetrd_2stage.o chetrd_he2hb.o chetrd_hb2st.o chb2st_kernels.o \ cheevd_2stage.o cheev_2stage.o cheevx_2stage.o cheevr_2stage.o \ chbev_2stage.o chbevx_2stage.o chbevd_2stage.o chegv_2stage.o \ - cgesvdq.o clatrs3.o ctrsyl3.o + cgesvdq.o clatrs3.o ctrsyl3.o cgelst.o endif ifdef USEXBLAS @@ -417,7 +417,7 @@ DLASRC_O = \ dsytrd_2stage.o dsytrd_sy2sb.o dsytrd_sb2st.o dsb2st_kernels.o \ dsyevd_2stage.o dsyev_2stage.o dsyevx_2stage.o dsyevr_2stage.o \ dsbev_2stage.o dsbevx_2stage.o dsbevd_2stage.o dsygv_2stage.o \ - dgesvdq.o dlarmm.o dlatrs3.o dtrsyl3.o + dgesvdq.o dlarmm.o dlatrs3.o dtrsyl3.o dgelst.o endif ifdef USEXBLAS @@ -526,7 +526,7 @@ ZLASRC_O = \ zhetrd_2stage.o zhetrd_he2hb.o zhetrd_hb2st.o zhb2st_kernels.o \ zheevd_2stage.o zheev_2stage.o zheevx_2stage.o zheevr_2stage.o \ zhbev_2stage.o zhbevx_2stage.o zhbevd_2stage.o zhegv_2stage.o \ - zgesvdq.o zlatrs3.o ztrsyl3.o + zgesvdq.o zlatrs3.o ztrsyl3.o zgelst.o endif ifdef USEXBLAS From 5ff46f40921b287c3f34d86770de56413f214680 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Nov 2022 22:49:31 +0100 Subject: [PATCH 117/154] Add ?GELST (Reference-LAPACK PR739) --- cmake/lapack.cmake | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index ca3a1e184..8a5ff22ec 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -124,7 +124,7 @@ set(SLASRC ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f sgesvdq.f slaorhr_col_getrfnp.f slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f - slarmm.f slatrs3.f strsyl3.f) + slarmm.f slatrs3.f strsyl3.f sgelst.f) set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f @@ -223,7 +223,7 @@ set(CLASRC chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f cungtsqr.f cungtsqr_row.f cunhr_col.f - clatrs3.f ctrsyl3.f ) + clatrs3.f ctrsyl3.f cgelst.f) set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f @@ -316,7 +316,7 @@ set(DLASRC dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f - dlarmm.f dlatrs3.f dtrsyl3.f) + dlarmm.f dlatrs3.f dtrsyl3.f dgelst.f) set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f @@ -419,7 +419,7 @@ set(ZLASRC zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f zungtsqr.f zungtsqr_row.f zunhr_col.f - zlatrs3.f ztrsyl3.f) + zlatrs3.f ztrsyl3.f zgelst.f) set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f @@ -622,7 +622,7 @@ set(SLASRC ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c sgesvdq.c 
slaorhr_col_getrfnp.c slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c - slarmm.c slatrs3.c strsyl3.c) + slarmm.c slatrs3.c strsyl3.c sgelst.c) set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c @@ -638,7 +638,7 @@ set(CLASRC cgecon.c cgeequ.c cgees.c cgeesx.c cgeev.c cgeevx.c cgehd2.c cgehrd.c cgelq2.c cgelqf.c cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c - cgeqr2.c cgeqr2p.c cgeqrf.c cgeqrfp.c cgerfs.c cgerq2.c cgerqf.c + cgeqr2.c cgeqr2p.c cgeqrf.c fcgeqrfp.c cgerfs.c cgerq2.c cgerqf.c cgesc2.c cgesdd.c cgesvd.c cgesvdx.c cgesvj.c cgejsv.c cgsvj0.c cgsvj1.c cgesvx.c cgetc2.c cgetrf2.c @@ -720,7 +720,7 @@ set(CLASRC chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c cungtsqr.c cungtsqr_row.c cunhr_col.c - clatrs3.c ctrsyl3.c) + clatrs3.c ctrsyl3.c cgelst.c) set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c @@ -812,7 +812,7 @@ set(DLASRC dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c - dlarmm.c dlatrs3.c dtrsyl3.c) + dlarmm.c dlatrs3.c dtrsyl3.c dgelst.c) set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c @@ -913,7 +913,7 @@ set(ZLASRC zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c - zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c) + zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c) set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c From f157d6d6718493215ae9ab915a9202a1018bbaf0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Nov 2022 22:50:57 +0100 Subject: [PATCH 118/154] Add C equivalents of ?GELST (for Reference-LAPACK PR739) --- lapack-netlib/SRC/cgelst.c | 1108 +++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/dgelst.c | 1104 +++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/sgelst.c | 1099 +++++++++++++++++++++++++++++++++++ lapack-netlib/SRC/zgelst.c | 1115 ++++++++++++++++++++++++++++++++++++ 4 files changed, 4426 insertions(+) create mode 100644 lapack-netlib/SRC/cgelst.c create mode 100644 lapack-netlib/SRC/dgelst.c create mode 100644 lapack-netlib/SRC/sgelst.c create mode 100644 lapack-netlib/SRC/zgelst.c diff --git a/lapack-netlib/SRC/cgelst.c b/lapack-netlib/SRC/cgelst.c new file mode 100644 index 000000000..48ded643d --- /dev/null +++ b/lapack-netlib/SRC/cgelst.c @@ -0,0 +1,1108 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef 
_MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? 
(a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief CGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factori +zation with compact WY representation of Q. */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* > \htmlonly */ +/* > Download CGELST + dependencies */ +/* > */ +/* > [TGZ] */ +/* > */ +/* > [ZIP] */ +/* > */ +/* > [TXT] */ +/* > \endhtmlonly */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE CGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* CHARACTER TRANS */ +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* COMPLEX A( LDA, * ), B( LDB, * ), WORK( * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > CGELST solves overdetermined or underdetermined real linear systems */ +/* > involving an M-by-N matrix A, or its conjugate-transpose, using a QR */ +/* > or LQ factorization of A with compact WY representation of Q. */ +/* > It is assumed that A has full rank. */ +/* > */ +/* > The following options are provided: */ +/* > */ +/* > 1. If TRANS = 'N' and m >= n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A*X ||. */ +/* > */ +/* > 2. If TRANS = 'N' and m < n: find the minimum norm solution of */ +/* > an underdetermined system A * X = B. */ +/* > */ +/* > 3. If TRANS = 'C' and m >= n: find the minimum norm solution of */ +/* > an underdetermined system A**T * X = B. */ +/* > */ +/* > 4. If TRANS = 'C' and m < n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A**T * X ||. */ +/* > */ +/* > Several right hand side vectors b and solution vectors x can be */ +/* > handled in a single call; they are stored as the columns of the */ +/* > M-by-NRHS right hand side matrix B and the N-by-NRHS solution */ +/* > matrix X. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > = 'N': the linear system involves A; */ +/* > = 'C': the linear system involves A**H. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of right hand sides, i.e., the number of */ +/* > columns of the matrices B and X. NRHS >=0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] A */ +/* > \verbatim */ +/* > A is COMPLEX array, dimension (LDA,N) */ +/* > On entry, the M-by-N matrix A. */ +/* > On exit, */ +/* > if M >= N, A is overwritten by details of its QR */ +/* > factorization as returned by CGEQRT; */ +/* > if M < N, A is overwritten by details of its LQ */ +/* > factorization as returned by CGELQT. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is COMPLEX array, dimension (LDB,NRHS) */ +/* > On entry, the matrix B of right hand side vectors, stored */ +/* > columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS */ +/* > if TRANS = 'C'. */ +/* > On exit, if INFO = 0, B is overwritten by the solution */ +/* > vectors, stored columnwise: */ +/* > if TRANS = 'N' and m >= n, rows 1 to n of B contain the least */ +/* > squares solution vectors; the residual sum of squares for the */ +/* > solution in each column is given by the sum of squares of */ +/* > modulus of elements N+1 to M in that column; */ +/* > if TRANS = 'N' and m < n, rows 1 to N of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'C' and m >= n, rows 1 to M of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'C' and m < n, rows 1 to M of B contain the */ +/* > least squares solution vectors; the residual sum of squares */ +/* > for the solution in each column is given by the sum of */ +/* > squares of the modulus of elements M+1 to N in that column. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= MAX(1,M,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is COMPLEX array, dimension (MAX(1,LWORK)) */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal LWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The dimension of the array WORK. */ +/* > LWORK >= f2cmax( 1, MN + f2cmax( MN, NRHS ) ). */ +/* > For optimal performance, */ +/* > LWORK >= f2cmax( 1, (MN + f2cmax( MN, NRHS ))*NB ). */ +/* > where MN = f2cmin(M,N) and NB is the optimum block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal size of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. 
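A minimal sketch of the two-call workspace-query pattern that the LWORK description above refers to, assuming the cgelst_ prototype and the complex/integer typedefs defined earlier in this translation unit are in scope (plus <stdlib.h> for malloc); the matrix dimensions and the malloc-based allocation are illustrative assumptions, not part of the routine itself.

      /* hypothetical caller: query the optimal LWORK, then solve min || B - A*X || */
      integer m = 4, n = 2, nrhs = 1, lda = 4, ldb = 4, lwork = -1, info = 0;
      complex a[4 * 2], b[4 * 1], wsize;
      /* ... fill a (M-by-N, column-major) and b (M-by-NRHS) ... */
      cgelst_("N", &m, &n, &nrhs, a, &lda, b, &ldb, &wsize, &lwork, &info); /* LWORK = -1: size query only */
      lwork = (integer) wsize.r;                        /* optimal size is returned in WORK(1) */
      complex *work = (complex *) malloc(sizeof(complex) * lwork);
      cgelst_("N", &m, &n, &nrhs, a, &lda, b, &ldb, work, &lwork, &info);   /* actual solve */
      /* on return with info == 0, rows 1..N of b hold the least-squares solution */
      free(work);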
*/ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > > 0: if INFO = i, the i-th diagonal element of the */ +/* > triangular factor of A is zero, so that A does not have */ +/* > full rank; the least squares solution could not be */ +/* > computed. */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup complexGEsolve */ + +/* > \par Contributors: */ +/* ================== */ +/* > */ +/* > \verbatim */ +/* > */ +/* > November 2022, Igor Kozachenko, */ +/* > Computer Science Division, */ +/* > University of California, Berkeley */ +/* > \endverbatim */ + +/* ===================================================================== */ +/* Subroutine */ int cgelst_(char *trans, integer *m, integer *n, integer * + nrhs, complex *a, integer *lda, complex *b, integer *ldb, complex * + work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3; + real r__1; + + /* Local variables */ + real anrm, bnrm; + integer brow; + logical tpsd; + integer i__, j, iascl, ibscl; + extern logical lsame_(char *, char *); + integer nbmin; + real rwork[1]; + integer lwopt, nb; + extern /* Subroutine */ int slabad_(real *, real *); + extern real clange_(char *, integer *, integer *, complex *, integer *, + real *); + integer mn; + extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, complex *, integer *, integer *); + extern real slamch_(char *); + extern /* Subroutine */ int claset_(char *, integer *, integer *, complex + *, complex *, complex *, integer *), xerbla_(char *, + integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + extern /* Subroutine */ int cgelqt_(integer *, integer *, integer *, + complex *, integer *, complex *, integer *, complex *, integer *); + integer scllen; + real bignum; + extern /* Subroutine */ int cgeqrt_(integer *, integer *, integer *, + complex *, integer *, complex *, integer *, complex *, integer *); + integer mnnrhs; + real smlnum; + logical lquery; + extern /* Subroutine */ int ctrtrs_(char *, char *, char *, integer *, + integer *, complex *, integer *, complex *, integer *, integer *), cgemlqt_(char *, char *, integer *, + integer *, integer *, integer *, complex *, integer *, complex *, + integer *, complex *, integer *, complex *, integer *), cgemqrt_(char *, char *, integer *, integer *, integer *, + integer *, complex *, integer *, complex *, integer *, complex *, + integer *, complex *, integer *); + + +/* -- LAPACK driver routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input arguments. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + mn = f2cmin(*m,*n); + lquery = *lwork == -1; + if (! 
(lsame_(trans, "N") || lsame_(trans, "C"))) { + *info = -1; + } else if (*m < 0) { + *info = -2; + } else if (*n < 0) { + *info = -3; + } else if (*nrhs < 0) { + *info = -4; + } else if (*lda < f2cmax(1,*m)) { + *info = -6; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = f2cmax(1,*m); + if (*ldb < f2cmax(i__1,*n)) { + *info = -8; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = 1, i__2 = mn + f2cmax(mn,*nrhs); + if (*lwork < f2cmax(i__1,i__2) && ! lquery) { + *info = -10; + } + } + } + +/* Figure out optimal block size and optimal workspace size */ + + if (*info == 0 || *info == -10) { + + tpsd = TRUE_; + if (lsame_(trans, "N")) { + tpsd = FALSE_; + } + + nb = ilaenv_(&c__1, "CGELST", " ", m, n, &c_n1, &c_n1, (ftnlen)6, ( + ftnlen)1); + + mnnrhs = f2cmax(mn,*nrhs); +/* Computing MAX */ + i__1 = 1, i__2 = (mn + mnnrhs) * nb; + lwopt = f2cmax(i__1,i__2); + r__1 = (real) lwopt; + work[1].r = r__1, work[1].i = 0.f; + + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("CGELST ", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + +/* Computing MIN */ + i__1 = f2cmin(*m,*n); + if (f2cmin(i__1,*nrhs) == 0) { + i__1 = f2cmax(*m,*n); + claset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); + r__1 = (real) lwopt; + work[1].r = r__1, work[1].i = 0.f; + return 0; + } + +/* *GEQRT and *GELQT routines cannot accept NB larger than f2cmin(M,N) */ + + if (nb > mn) { + nb = mn; + } + +/* Determine the block size from the supplied LWORK */ +/* ( at this stage we know that LWORK >= (minimum required workspace, */ +/* but it may be less than optimal) */ + +/* Computing MIN */ + i__1 = nb, i__2 = *lwork / (mn + mnnrhs); + nb = f2cmin(i__1,i__2); + +/* The minimum value of NB, when blocked code is used */ + +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "CGELST", " ", m, n, &c_n1, &c_n1, ( + ftnlen)6, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + if (nb < nbmin) { + nb = 1; + } + +/* Get machine parameters */ + + smlnum = slamch_("S") / slamch_("P"); + bignum = 1.f / smlnum; + slabad_(&smlnum, &bignum); + +/* Scale A, B if f2cmax element outside range [SMLNUM,BIGNUM] */ + + anrm = clange_("M", m, n, &a[a_offset], lda, rwork); + iascl = 0; + if (anrm > 0.f && anrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + clascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, + info); + iascl = 1; + } else if (anrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + clascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda, + info); + iascl = 2; + } else if (anrm == 0.f) { + +/* Matrix all zero. Return zero solution. */ + + i__1 = f2cmax(*m,*n); + claset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); + r__1 = (real) lwopt; + work[1].r = r__1, work[1].i = 0.f; + return 0; + } + + brow = *m; + if (tpsd) { + brow = *n; + } + bnrm = clange_("M", &brow, nrhs, &b[b_offset], ldb, rwork); + ibscl = 0; + if (bnrm > 0.f && bnrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + clascl_("G", &c__0, &c__0, &bnrm, &smlnum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 1; + } else if (bnrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + clascl_("G", &c__0, &c__0, &bnrm, &bignum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 2; + } + + if (*m >= *n) { + +/* M > N: */ +/* Compute the blocked QR factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least N, optimally N*NB. 
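As a reading aid for the branches that follow (a summary in the comments' own notation, not text from the routine), writing A = Q*R when M >= N and A = L*Q when M < N, the four cases reduce to:

      TRANS = 'N', M >= N  (least squares):  X = inv(R) * Q**H * B
      TRANS = 'N', M <  N  (minimum norm):   X = Q**H * inv(L) * B
      TRANS = 'C', M >= N  (minimum norm):   X = Q * inv(R**H) * B
      TRANS = 'C', M <  N  (least squares):  X = inv(L**H) * Q * B

In the first case the routine leaves rows N+1..M of Q**H * B in place, so the residual sum of squares of each column is the squared norm of those trailing rows, as stated in the documentation block above.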
*/ + + cgeqrt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M > N, A is not transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A * X - B ||. */ + +/* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + cgemqrt_("Left", "Conjugate transpose", m, nrhs, n, &nb, &a[ + a_offset], lda, &work[1], &nb, &b[b_offset], ldb, &work[ + mn * nb + 1], info); + +/* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) */ + + ctrtrs_("Upper", "No transpose", "Non-unit", n, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + + scllen = *n; + + } else { + +/* M > N, A is transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A**T * X = B. */ + +/* Compute B := inv(R**T) * B in two row blocks of B. */ + +/* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) */ + + ctrtrs_("Upper", "Conjugate transpose", "Non-unit", n, nrhs, &a[ + a_offset], lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + +/* Block 2: Zero out all rows below the N-th row in B: */ +/* B(N+1:M,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = *n + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * b_dim1; + b[i__3].r = 0.f, b[i__3].i = 0.f; + } + } + +/* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + cgemqrt_("Left", "No transpose", m, nrhs, n, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + + scllen = *m; + + } + + } else { + +/* M < N: */ +/* Compute the blocked LQ factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least M, optimally M*NB. */ + + cgelqt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M < N, A is not transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A * X = B. */ + +/* Compute B := inv(L) * B in two row blocks of B. */ + +/* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) */ + + ctrtrs_("Lower", "No transpose", "Non-unit", m, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + +/* Block 2: Zero out all rows below the M-th row in B: */ +/* B(M+1:N,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *n; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * b_dim1; + b[i__3].r = 0.f, b[i__3].i = 0.f; + } + } + +/* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + cgemlqt_("Left", "Conjugate transpose", n, nrhs, m, &nb, &a[ + a_offset], lda, &work[1], &nb, &b[b_offset], ldb, &work[ + mn * nb + 1], info); + + scllen = *n; + + } else { + +/* M < N, A is transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A**T * X - B ||. */ + +/* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. 
*/ + + cgemlqt_("Left", "No transpose", n, nrhs, m, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + +/* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) */ + + ctrtrs_("Lower", "Conjugate transpose", "Non-unit", m, nrhs, &a[ + a_offset], lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + + scllen = *m; + + } + + } + +/* Undo scaling */ + + if (iascl == 1) { + clascl_("G", &c__0, &c__0, &anrm, &smlnum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (iascl == 2) { + clascl_("G", &c__0, &c__0, &anrm, &bignum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + if (ibscl == 1) { + clascl_("G", &c__0, &c__0, &smlnum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (ibscl == 2) { + clascl_("G", &c__0, &c__0, &bignum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + + r__1 = (real) lwopt; + work[1].r = r__1, work[1].i = 0.f; + + return 0; + +/* End of CGELST */ + +} /* cgelst_ */ + diff --git a/lapack-netlib/SRC/dgelst.c b/lapack-netlib/SRC/dgelst.c new file mode 100644 index 000000000..9327da4dd --- /dev/null +++ b/lapack-netlib/SRC/dgelst.c @@ -0,0 +1,1104 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint 
aunit; +} alist; + +/* inquire */ +typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief DGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factori +zation with compact WY representation of Q. */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* > \htmlonly */ +/* > Download DGELST + dependencies */ +/* > */ +/* > [TGZ] */ +/* > */ +/* > [ZIP] */ +/* > */ +/* > [TXT] */ +/* > \endhtmlonly */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE DGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* CHARACTER TRANS */ +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* DOUBLE PRECISION A( LDA, * ), B( LDB, * ), WORK( * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > DGELST solves overdetermined or underdetermined real linear systems */ +/* > involving an M-by-N matrix A, or its transpose, using a QR or LQ */ +/* > factorization of A with compact WY representation of Q. */ +/* > It is assumed that A has full rank. */ +/* > */ +/* > The following options are provided: */ +/* > */ +/* > 1. If TRANS = 'N' and m >= n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A*X ||. */ +/* > */ +/* > 2. If TRANS = 'N' and m < n: find the minimum norm solution of */ +/* > an underdetermined system A * X = B. */ +/* > */ +/* > 3. If TRANS = 'T' and m >= n: find the minimum norm solution of */ +/* > an underdetermined system A**T * X = B. */ +/* > */ +/* > 4. If TRANS = 'T' and m < n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A**T * X ||. */ +/* > */ +/* > Several right hand side vectors b and solution vectors x can be */ +/* > handled in a single call; they are stored as the columns of the */ +/* > M-by-NRHS right hand side matrix B and the N-by-NRHS solution */ +/* > matrix X. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > = 'N': the linear system involves A; */ +/* > = 'T': the linear system involves A**T. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of right hand sides, i.e., the number of */ +/* > columns of the matrices B and X. NRHS >=0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] A */ +/* > \verbatim */ +/* > A is DOUBLE PRECISION array, dimension (LDA,N) */ +/* > On entry, the M-by-N matrix A. */ +/* > On exit, */ +/* > if M >= N, A is overwritten by details of its QR */ +/* > factorization as returned by DGEQRT; */ +/* > if M < N, A is overwritten by details of its LQ */ +/* > factorization as returned by DGELQT. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is DOUBLE PRECISION array, dimension (LDB,NRHS) */ +/* > On entry, the matrix B of right hand side vectors, stored */ +/* > columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS */ +/* > if TRANS = 'T'. */ +/* > On exit, if INFO = 0, B is overwritten by the solution */ +/* > vectors, stored columnwise: */ +/* > if TRANS = 'N' and m >= n, rows 1 to n of B contain the least */ +/* > squares solution vectors; the residual sum of squares for the */ +/* > solution in each column is given by the sum of squares of */ +/* > elements N+1 to M in that column; */ +/* > if TRANS = 'N' and m < n, rows 1 to N of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'T' and m >= n, rows 1 to M of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'T' and m < n, rows 1 to M of B contain the */ +/* > least squares solution vectors; the residual sum of squares */ +/* > for the solution in each column is given by the sum of */ +/* > squares of elements M+1 to N in that column. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= MAX(1,M,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)) */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal LWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The dimension of the array WORK. */ +/* > LWORK >= f2cmax( 1, MN + f2cmax( MN, NRHS ) ). */ +/* > For optimal performance, */ +/* > LWORK >= f2cmax( 1, (MN + f2cmax( MN, NRHS ))*NB ). */ +/* > where MN = f2cmin(M,N) and NB is the optimum block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal size of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. 
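To make the storage conventions described above concrete, here is a minimal hypothetical caller, assuming the integer/doublereal typedefs and the dgelst_ prototype from this file are visible; the data values and the fixed LWORK = 64 (any value >= MN + max(MN,NRHS) is accepted) are illustrative assumptions.

      #include <stdio.h>
      /* fits b ~ A*x in the least-squares sense for a 3-by-2 column-major A */
      int main(void) {
          integer m = 3, n = 2, nrhs = 1, lda = 3, ldb = 3, lwork = 64, info = 0;
          doublereal a[6] = { 1., 1., 1.,     /* column 1 of A */
                              0., 1., 2. };   /* column 2 of A */
          doublereal b[3] = { 1., 2., 2. };   /* right-hand side, overwritten on exit */
          doublereal work[64];
          dgelst_("N", &m, &n, &nrhs, a, &lda, b, &ldb, work, &lwork, &info);
          if (info == 0)
              printf("x = (%g, %g)\n", b[0], b[1]);  /* rows 1..N of B hold the solution */
          return (int) info;                         /* b[2]^2 is the residual sum of squares */
      }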
*/ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > > 0: if INFO = i, the i-th diagonal element of the */ +/* > triangular factor of A is zero, so that A does not have */ +/* > full rank; the least squares solution could not be */ +/* > computed. */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup doubleGEsolve */ + +/* > \par Contributors: */ +/* ================== */ +/* > */ +/* > \verbatim */ +/* > */ +/* > November 2022, Igor Kozachenko, */ +/* > Computer Science Division, */ +/* > University of California, Berkeley */ +/* > \endverbatim */ + +/* ===================================================================== */ +/* Subroutine */ int dgelst_(char *trans, integer *m, integer *n, integer * + nrhs, doublereal *a, integer *lda, doublereal *b, integer *ldb, + doublereal *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2; + + /* Local variables */ + doublereal anrm, bnrm; + integer brow; + logical tpsd; + integer i__, j, iascl, ibscl; + extern logical lsame_(char *, char *); + integer nbmin; + doublereal rwork[1]; + integer lwopt; + extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + integer nb; + extern doublereal dlamch_(char *), dlange_(char *, integer *, + integer *, doublereal *, integer *, doublereal *); + integer mn; + extern /* Subroutine */ int dlascl_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, integer *, doublereal *, + integer *, integer *), dlaset_(char *, integer *, integer + *, doublereal *, doublereal *, doublereal *, integer *), + xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + integer scllen; + doublereal bignum; + extern /* Subroutine */ int dgelqt_(integer *, integer *, integer *, + doublereal *, integer *, doublereal *, integer *, doublereal *, + integer *), dgeqrt_(integer *, integer *, integer *, doublereal *, + integer *, doublereal *, integer *, doublereal *, integer *); + integer mnnrhs; + doublereal smlnum; + logical lquery; + extern /* Subroutine */ int dtrtrs_(char *, char *, char *, integer *, + integer *, doublereal *, integer *, doublereal *, integer *, + integer *), dgemlqt_(char *, char *, + integer *, integer *, integer *, integer *, doublereal *, integer + *, doublereal *, integer *, doublereal *, integer *, doublereal *, + integer *), dgemqrt_(char *, char *, integer *, + integer *, integer *, integer *, doublereal *, integer *, + doublereal *, integer *, doublereal *, integer *, doublereal *, + integer *); + + +/* -- LAPACK driver routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input arguments. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + mn = f2cmin(*m,*n); + lquery = *lwork == -1; + if (! 
(lsame_(trans, "N") || lsame_(trans, "T"))) { + *info = -1; + } else if (*m < 0) { + *info = -2; + } else if (*n < 0) { + *info = -3; + } else if (*nrhs < 0) { + *info = -4; + } else if (*lda < f2cmax(1,*m)) { + *info = -6; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = f2cmax(1,*m); + if (*ldb < f2cmax(i__1,*n)) { + *info = -8; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = 1, i__2 = mn + f2cmax(mn,*nrhs); + if (*lwork < f2cmax(i__1,i__2) && ! lquery) { + *info = -10; + } + } + } + +/* Figure out optimal block size and optimal workspace size */ + + if (*info == 0 || *info == -10) { + + tpsd = TRUE_; + if (lsame_(trans, "N")) { + tpsd = FALSE_; + } + + nb = ilaenv_(&c__1, "DGELST", " ", m, n, &c_n1, &c_n1, (ftnlen)6, ( + ftnlen)1); + + mnnrhs = f2cmax(mn,*nrhs); +/* Computing MAX */ + i__1 = 1, i__2 = (mn + mnnrhs) * nb; + lwopt = f2cmax(i__1,i__2); + work[1] = (doublereal) lwopt; + + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("DGELST ", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + +/* Computing MIN */ + i__1 = f2cmin(*m,*n); + if (f2cmin(i__1,*nrhs) == 0) { + i__1 = f2cmax(*m,*n); + dlaset_("Full", &i__1, nrhs, &c_b12, &c_b12, &b[b_offset], ldb); + work[1] = (doublereal) lwopt; + return 0; + } + +/* *GEQRT and *GELQT routines cannot accept NB larger than f2cmin(M,N) */ + + if (nb > mn) { + nb = mn; + } + +/* Determine the block size from the supplied LWORK */ +/* ( at this stage we know that LWORK >= (minimum required workspace, */ +/* but it may be less than optimal) */ + +/* Computing MIN */ + i__1 = nb, i__2 = *lwork / (mn + mnnrhs); + nb = f2cmin(i__1,i__2); + +/* The minimum value of NB, when blocked code is used */ + +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "DGELST", " ", m, n, &c_n1, &c_n1, ( + ftnlen)6, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + if (nb < nbmin) { + nb = 1; + } + +/* Get machine parameters */ + + smlnum = dlamch_("S") / dlamch_("P"); + bignum = 1. / smlnum; + dlabad_(&smlnum, &bignum); + +/* Scale A, B if f2cmax element outside range [SMLNUM,BIGNUM] */ + + anrm = dlange_("M", m, n, &a[a_offset], lda, rwork); + iascl = 0; + if (anrm > 0. && anrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + dlascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, + info); + iascl = 1; + } else if (anrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + dlascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda, + info); + iascl = 2; + } else if (anrm == 0.) { + +/* Matrix all zero. Return zero solution. */ + + i__1 = f2cmax(*m,*n); + dlaset_("Full", &i__1, nrhs, &c_b12, &c_b12, &b[b_offset], ldb); + work[1] = (doublereal) lwopt; + return 0; + } + + brow = *m; + if (tpsd) { + brow = *n; + } + bnrm = dlange_("M", &brow, nrhs, &b[b_offset], ldb, rwork); + ibscl = 0; + if (bnrm > 0. && bnrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + dlascl_("G", &c__0, &c__0, &bnrm, &smlnum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 1; + } else if (bnrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + dlascl_("G", &c__0, &c__0, &bnrm, &bignum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 2; + } + + if (*m >= *n) { + +/* M > N: */ +/* Compute the blocked QR factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least N, optimally N*NB. */ + + dgeqrt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! 
tpsd) { + +/* M > N, A is not transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A * X - B ||. */ + +/* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + dgemqrt_("Left", "Transpose", m, nrhs, n, &nb, &a[a_offset], lda, + &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + +/* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) */ + + dtrtrs_("Upper", "No transpose", "Non-unit", n, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + + scllen = *n; + + } else { + +/* M > N, A is transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A**T * X = B. */ + +/* Compute B := inv(R**T) * B in two row blocks of B. */ + +/* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) */ + + dtrtrs_("Upper", "Transpose", "Non-unit", n, nrhs, &a[a_offset], + lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + +/* Block 2: Zero out all rows below the N-th row in B: */ +/* B(N+1:M,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = *n + 1; i__ <= i__2; ++i__) { + b[i__ + j * b_dim1] = 0.; + } + } + +/* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + dgemqrt_("Left", "No transpose", m, nrhs, n, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + + scllen = *m; + + } + + } else { + +/* M < N: */ +/* Compute the blocked LQ factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least M, optimally M*NB. */ + + dgelqt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M < N, A is not transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A * X = B. */ + +/* Compute B := inv(L) * B in two row blocks of B. */ + +/* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) */ + + dtrtrs_("Lower", "No transpose", "Non-unit", m, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + +/* Block 2: Zero out all rows below the M-th row in B: */ +/* B(M+1:N,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *n; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + b[i__ + j * b_dim1] = 0.; + } + } + +/* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + dgemlqt_("Left", "Transpose", n, nrhs, m, &nb, &a[a_offset], lda, + &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + + scllen = *n; + + } else { + +/* M < N, A is transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A**T * X - B ||. */ + +/* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. 
*/ + + dgemlqt_("Left", "No transpose", n, nrhs, m, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + +/* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) */ + + dtrtrs_("Lower", "Transpose", "Non-unit", m, nrhs, &a[a_offset], + lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + + scllen = *m; + + } + + } + +/* Undo scaling */ + + if (iascl == 1) { + dlascl_("G", &c__0, &c__0, &anrm, &smlnum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (iascl == 2) { + dlascl_("G", &c__0, &c__0, &anrm, &bignum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + if (ibscl == 1) { + dlascl_("G", &c__0, &c__0, &smlnum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (ibscl == 2) { + dlascl_("G", &c__0, &c__0, &bignum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + + work[1] = (doublereal) lwopt; + + return 0; + +/* End of DGELST */ + +} /* dgelst_ */ + diff --git a/lapack-netlib/SRC/sgelst.c b/lapack-netlib/SRC/sgelst.c new file mode 100644 index 000000000..e0cd84cd9 --- /dev/null +++ b/lapack-netlib/SRC/sgelst.c @@ -0,0 +1,1099 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ 
+typedef struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief SGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factori +zation with compact WY representation of Q. */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* > \htmlonly */ +/* > Download SGELST + dependencies */ +/* > */ +/* > [TGZ] */ +/* > */ +/* > [ZIP] */ +/* > */ +/* > [TXT] */ +/* > \endhtmlonly */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE SGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* CHARACTER TRANS */ +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* REAL A( LDA, * ), B( LDB, * ), WORK( * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > SGELST solves overdetermined or underdetermined real linear systems */ +/* > involving an M-by-N matrix A, or its transpose, using a QR or LQ */ +/* > factorization of A with compact WY representation of Q. */ +/* > It is assumed that A has full rank. */ +/* > */ +/* > The following options are provided: */ +/* > */ +/* > 1. If TRANS = 'N' and m >= n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A*X ||. */ +/* > */ +/* > 2. If TRANS = 'N' and m < n: find the minimum norm solution of */ +/* > an underdetermined system A * X = B. */ +/* > */ +/* > 3. If TRANS = 'T' and m >= n: find the minimum norm solution of */ +/* > an underdetermined system A**T * X = B. */ +/* > */ +/* > 4. If TRANS = 'T' and m < n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A**T * X ||. */ +/* > */ +/* > Several right hand side vectors b and solution vectors x can be */ +/* > handled in a single call; they are stored as the columns of the */ +/* > M-by-NRHS right hand side matrix B and the N-by-NRHS solution */ +/* > matrix X. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > = 'N': the linear system involves A; */ +/* > = 'T': the linear system involves A**T. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of right hand sides, i.e., the number of */ +/* > columns of the matrices B and X. NRHS >=0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] A */ +/* > \verbatim */ +/* > A is REAL array, dimension (LDA,N) */ +/* > On entry, the M-by-N matrix A. */ +/* > On exit, */ +/* > if M >= N, A is overwritten by details of its QR */ +/* > factorization as returned by SGEQRT; */ +/* > if M < N, A is overwritten by details of its LQ */ +/* > factorization as returned by SGELQT. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is REAL array, dimension (LDB,NRHS) */ +/* > On entry, the matrix B of right hand side vectors, stored */ +/* > columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS */ +/* > if TRANS = 'T'. */ +/* > On exit, if INFO = 0, B is overwritten by the solution */ +/* > vectors, stored columnwise: */ +/* > if TRANS = 'N' and m >= n, rows 1 to n of B contain the least */ +/* > squares solution vectors; the residual sum of squares for the */ +/* > solution in each column is given by the sum of squares of */ +/* > elements N+1 to M in that column; */ +/* > if TRANS = 'N' and m < n, rows 1 to N of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'T' and m >= n, rows 1 to M of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'T' and m < n, rows 1 to M of B contain the */ +/* > least squares solution vectors; the residual sum of squares */ +/* > for the solution in each column is given by the sum of */ +/* > squares of elements M+1 to N in that column. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= MAX(1,M,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is REAL array, dimension (MAX(1,LWORK)) */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal LWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The dimension of the array WORK. */ +/* > LWORK >= f2cmax( 1, MN + f2cmax( MN, NRHS ) ). */ +/* > For optimal performance, */ +/* > LWORK >= f2cmax( 1, (MN + f2cmax( MN, NRHS ))*NB ). */ +/* > where MN = f2cmin(M,N) and NB is the optimum block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal size of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. */ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > > 0: if INFO = i, the i-th diagonal element of the */ +/* > triangular factor of A is zero, so that A does not have */ +/* > full rank; the least squares solution could not be */ +/* > computed. 
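   As a usage illustration, a C caller of this driver typically issues a
   workspace query first. A minimal sketch only (it assumes the f2c-style
   prototype generated in this file, column-major data already filled in,
   <stdlib.h> for malloc, and no error checking; all variable names are
   local to the example):

     integer m = 6, n = 4, nrhs = 1, lda = 6, ldb = 6, lwork = -1, info;
     real a[6*4], b[6*1], wkopt, *work;
     sgelst_("N", &m, &n, &nrhs, a, &lda, b, &ldb, &wkopt, &lwork, &info);
     lwork = (integer) wkopt;
     work = (real *) malloc(lwork * sizeof(real));
     sgelst_("N", &m, &n, &nrhs, a, &lda, b, &ldb, work, &lwork, &info);
     free(work);

   On exit rows 1 to N of B hold the least-squares solution and rows N+1
   to M of each column give its residual components, as described above.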
*/ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup realGEsolve */ + +/* > \par Contributors: */ +/* ================== */ +/* > */ +/* > \verbatim */ +/* > */ +/* > November 2022, Igor Kozachenko, */ +/* > Computer Science Division, */ +/* > University of California, Berkeley */ +/* > \endverbatim */ + +/* ===================================================================== */ +/* Subroutine */ int sgelst_(char *trans, integer *m, integer *n, integer * + nrhs, real *a, integer *lda, real *b, integer *ldb, real *work, + integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2; + + /* Local variables */ + real anrm, bnrm; + integer brow; + logical tpsd; + integer i__, j, iascl, ibscl; + extern logical lsame_(char *, char *); + integer nbmin; + real rwork[1]; + integer lwopt, nb; + extern /* Subroutine */ int slabad_(real *, real *); + integer mn; + extern real slamch_(char *), slange_(char *, integer *, integer *, + real *, integer *, real *); + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + integer scllen; + real bignum; + extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, real *, integer *, integer *), slaset_(char *, integer *, integer *, real *, real *, + real *, integer *), sgelqt_(integer *, integer *, integer + *, real *, integer *, real *, integer *, real *, integer *); + integer mnnrhs; + extern /* Subroutine */ int sgeqrt_(integer *, integer *, integer *, real + *, integer *, real *, integer *, real *, integer *); + real smlnum; + logical lquery; + extern /* Subroutine */ int strtrs_(char *, char *, char *, integer *, + integer *, real *, integer *, real *, integer *, integer *), sgemlqt_(char *, char *, integer *, + integer *, integer *, integer *, real *, integer *, real *, + integer *, real *, integer *, real *, integer *), + sgemqrt_(char *, char *, integer *, integer *, integer *, integer + *, real *, integer *, real *, integer *, real *, integer *, real * + , integer *); + + +/* -- LAPACK driver routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input arguments. */ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + mn = f2cmin(*m,*n); + lquery = *lwork == -1; + if (! (lsame_(trans, "N") || lsame_(trans, "T"))) { + *info = -1; + } else if (*m < 0) { + *info = -2; + } else if (*n < 0) { + *info = -3; + } else if (*nrhs < 0) { + *info = -4; + } else if (*lda < f2cmax(1,*m)) { + *info = -6; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = f2cmax(1,*m); + if (*ldb < f2cmax(i__1,*n)) { + *info = -8; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = 1, i__2 = mn + f2cmax(mn,*nrhs); + if (*lwork < f2cmax(i__1,i__2) && ! 
lquery) { + *info = -10; + } + } + } + +/* Figure out optimal block size and optimal workspace size */ + + if (*info == 0 || *info == -10) { + + tpsd = TRUE_; + if (lsame_(trans, "N")) { + tpsd = FALSE_; + } + + nb = ilaenv_(&c__1, "SGELST", " ", m, n, &c_n1, &c_n1, (ftnlen)6, ( + ftnlen)1); + + mnnrhs = f2cmax(mn,*nrhs); +/* Computing MAX */ + i__1 = 1, i__2 = (mn + mnnrhs) * nb; + lwopt = f2cmax(i__1,i__2); + work[1] = (real) lwopt; + + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("SGELST ", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + +/* Computing MIN */ + i__1 = f2cmin(*m,*n); + if (f2cmin(i__1,*nrhs) == 0) { + i__1 = f2cmax(*m,*n); + slaset_("Full", &i__1, nrhs, &c_b12, &c_b12, &b[b_offset], ldb); + work[1] = (real) lwopt; + return 0; + } + +/* *GEQRT and *GELQT routines cannot accept NB larger than f2cmin(M,N) */ + + if (nb > mn) { + nb = mn; + } + +/* Determine the block size from the supplied LWORK */ +/* ( at this stage we know that LWORK >= (minimum required workspace, */ +/* but it may be less than optimal) */ + +/* Computing MIN */ + i__1 = nb, i__2 = *lwork / (mn + mnnrhs); + nb = f2cmin(i__1,i__2); + +/* The minimum value of NB, when blocked code is used */ + +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "SGELST", " ", m, n, &c_n1, &c_n1, ( + ftnlen)6, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + if (nb < nbmin) { + nb = 1; + } + +/* Get machine parameters */ + + smlnum = slamch_("S") / slamch_("P"); + bignum = 1.f / smlnum; + slabad_(&smlnum, &bignum); + +/* Scale A, B if f2cmax element outside range [SMLNUM,BIGNUM] */ + + anrm = slange_("M", m, n, &a[a_offset], lda, rwork); + iascl = 0; + if (anrm > 0.f && anrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + slascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, + info); + iascl = 1; + } else if (anrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + slascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda, + info); + iascl = 2; + } else if (anrm == 0.f) { + +/* Matrix all zero. Return zero solution. */ + + i__1 = f2cmax(*m,*n); + slaset_("Full", &i__1, nrhs, &c_b12, &c_b12, &b[b_offset], ldb); + work[1] = (real) lwopt; + return 0; + } + + brow = *m; + if (tpsd) { + brow = *n; + } + bnrm = slange_("M", &brow, nrhs, &b[b_offset], ldb, rwork); + ibscl = 0; + if (bnrm > 0.f && bnrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + slascl_("G", &c__0, &c__0, &bnrm, &smlnum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 1; + } else if (bnrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + slascl_("G", &c__0, &c__0, &bnrm, &bignum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 2; + } + + if (*m >= *n) { + +/* M > N: */ +/* Compute the blocked QR factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least N, optimally N*NB. */ + + sgeqrt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M > N, A is not transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A * X - B ||. */ + +/* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. 
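   (On the scaling above: SMLNUM is the safe minimum divided by the
   machine precision and BIGNUM is its reciprocal, so any matrix whose
   largest entry falls outside [SMLNUM, BIGNUM] is rescaled by SLASCL so
   that its norm lands exactly on the nearer bound; the corresponding
   "Undo scaling" block at the end of the routine then rescales the
   computed solution, which is why the branches below can treat A and B
   as already well scaled.)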
*/ + + sgemqrt_("Left", "Transpose", m, nrhs, n, &nb, &a[a_offset], lda, + &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + +/* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) */ + + strtrs_("Upper", "No transpose", "Non-unit", n, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + + scllen = *n; + + } else { + +/* M > N, A is transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A**T * X = B. */ + +/* Compute B := inv(R**T) * B in two row blocks of B. */ + +/* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) */ + + strtrs_("Upper", "Transpose", "Non-unit", n, nrhs, &a[a_offset], + lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + +/* Block 2: Zero out all rows below the N-th row in B: */ +/* B(N+1:M,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = *n + 1; i__ <= i__2; ++i__) { + b[i__ + j * b_dim1] = 0.f; + } + } + +/* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + sgemqrt_("Left", "No transpose", m, nrhs, n, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + + scllen = *m; + + } + + } else { + +/* M < N: */ +/* Compute the blocked LQ factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least M, optimally M*NB. */ + + sgelqt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M < N, A is not transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A * X = B. */ + +/* Compute B := inv(L) * B in two row blocks of B. */ + +/* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) */ + + strtrs_("Lower", "No transpose", "Non-unit", m, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + +/* Block 2: Zero out all rows below the M-th row in B: */ +/* B(M+1:N,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *n; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + b[i__ + j * b_dim1] = 0.f; + } + } + +/* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + sgemlqt_("Left", "Transpose", n, nrhs, m, &nb, &a[a_offset], lda, + &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + + scllen = *n; + + } else { + +/* M < N, A is transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A**T * X - B ||. */ + +/* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. 
*/ + + sgemlqt_("Left", "No transpose", n, nrhs, m, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + +/* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) */ + + strtrs_("Lower", "Transpose", "Non-unit", m, nrhs, &a[a_offset], + lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + + scllen = *m; + + } + + } + +/* Undo scaling */ + + if (iascl == 1) { + slascl_("G", &c__0, &c__0, &anrm, &smlnum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (iascl == 2) { + slascl_("G", &c__0, &c__0, &anrm, &bignum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + if (ibscl == 1) { + slascl_("G", &c__0, &c__0, &smlnum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (ibscl == 2) { + slascl_("G", &c__0, &c__0, &bignum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + + work[1] = (real) lwopt; + + return 0; + +/* End of SGELST */ + +} /* sgelst_ */ + diff --git a/lapack-netlib/SRC/zgelst.c b/lapack-netlib/SRC/zgelst.c new file mode 100644 index 000000000..447cd30bb --- /dev/null +++ b/lapack-netlib/SRC/zgelst.c @@ -0,0 +1,1115 @@ +#include +#include +#include +#include +#include +#ifdef complex +#undef complex +#endif +#ifdef I +#undef I +#endif + +#if defined(_WIN64) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef LAPACK_ILP64 +typedef BLASLONG blasint; +#if defined(_WIN64) +#define blasabs(x) llabs(x) +#else +#define blasabs(x) labs(x) +#endif +#else +typedef int blasint; +#define blasabs(x) abs(x) +#endif + +typedef blasint integer; + +typedef unsigned int uinteger; +typedef char *address; +typedef short int shortint; +typedef float real; +typedef double doublereal; +typedef struct { real r, i; } complex; +typedef struct { doublereal r, i; } doublecomplex; +#ifdef _MSC_VER +static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} +static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} +static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} +static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} +#else +static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} +static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} +static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} +#endif +#define pCf(z) (*_pCf(z)) +#define pCd(z) (*_pCd(z)) +typedef int logical; +typedef short int shortlogical; +typedef char logical1; +typedef char integer1; + +#define TRUE_ (1) +#define FALSE_ (0) + +/* Extern is for use with -E */ +#ifndef Extern +#define Extern extern +#endif + +/* I/O stuff */ + +typedef int flag; +typedef int ftnlen; +typedef int ftnint; + +/*external read, write*/ +typedef struct +{ flag cierr; + ftnint ciunit; + flag ciend; + char *cifmt; + ftnint cirec; +} cilist; + +/*internal read, write*/ +typedef struct +{ flag icierr; + char *iciunit; + flag iciend; + char *icifmt; + ftnint icirlen; + ftnint icirnum; +} icilist; + +/*open*/ +typedef struct +{ flag oerr; + ftnint ounit; + char *ofnm; + ftnlen ofnmlen; + char *osta; + char *oacc; + char *ofm; + ftnint orl; + char *oblnk; +} olist; + +/*close*/ +typedef struct +{ flag cerr; + ftnint cunit; + char *csta; +} cllist; + +/*rewind, backspace, endfile*/ +typedef struct +{ flag aerr; + ftnint aunit; +} alist; + +/* inquire */ +typedef 
struct +{ flag inerr; + ftnint inunit; + char *infile; + ftnlen infilen; + ftnint *inex; /*parameters in standard's order*/ + ftnint *inopen; + ftnint *innum; + ftnint *innamed; + char *inname; + ftnlen innamlen; + char *inacc; + ftnlen inacclen; + char *inseq; + ftnlen inseqlen; + char *indir; + ftnlen indirlen; + char *infmt; + ftnlen infmtlen; + char *inform; + ftnint informlen; + char *inunf; + ftnlen inunflen; + ftnint *inrecl; + ftnint *innrec; + char *inblank; + ftnlen inblanklen; +} inlist; + +#define VOID void + +union Multitype { /* for multiple entry points */ + integer1 g; + shortint h; + integer i; + /* longint j; */ + real r; + doublereal d; + complex c; + doublecomplex z; + }; + +typedef union Multitype Multitype; + +struct Vardesc { /* for Namelist */ + char *name; + char *addr; + ftnlen *dims; + int type; + }; +typedef struct Vardesc Vardesc; + +struct Namelist { + char *name; + Vardesc **vars; + int nvars; + }; +typedef struct Namelist Namelist; + +#define abs(x) ((x) >= 0 ? (x) : -(x)) +#define dabs(x) (fabs(x)) +#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) +#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) +#define dmin(a,b) (f2cmin(a,b)) +#define dmax(a,b) (f2cmax(a,b)) +#define bit_test(a,b) ((a) >> (b) & 1) +#define bit_clear(a,b) ((a) & ~((uinteger)1 << (b))) +#define bit_set(a,b) ((a) | ((uinteger)1 << (b))) + +#define abort_() { sig_die("Fortran abort routine called", 1); } +#define c_abs(z) (cabsf(Cf(z))) +#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); } +#ifdef _MSC_VER +#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);} +#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);} +#else +#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);} +#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);} +#endif +#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} +#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));} +#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));} +//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));} +#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));} +#define d_abs(x) (fabs(*(x))) +#define d_acos(x) (acos(*(x))) +#define d_asin(x) (asin(*(x))) +#define d_atan(x) (atan(*(x))) +#define d_atn2(x, y) (atan2(*(x),*(y))) +#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } +#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } +#define d_cos(x) (cos(*(x))) +#define d_cosh(x) (cosh(*(x))) +#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) +#define d_exp(x) (exp(*(x))) +#define d_imag(z) (cimag(Cd(z))) +#define r_imag(z) (cimagf(Cf(z))) +#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) +#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) +#define d_log(x) (log(*(x))) +#define d_mod(x, y) (fmod(*(x), *(y))) +#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) +#define d_nint(x) u_nint(*(x)) +#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? 
(__a) : -(__a))) +#define d_sign(a,b) u_sign(*(a),*(b)) +#define r_sign(a,b) u_sign(*(a),*(b)) +#define d_sin(x) (sin(*(x))) +#define d_sinh(x) (sinh(*(x))) +#define d_sqrt(x) (sqrt(*(x))) +#define d_tan(x) (tan(*(x))) +#define d_tanh(x) (tanh(*(x))) +#define i_abs(x) abs(*(x)) +#define i_dnnt(x) ((integer)u_nint(*(x))) +#define i_len(s, n) (n) +#define i_nint(x) ((integer)u_nint(*(x))) +#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b))) +#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) +#define pow_si(B,E) spow_ui(*(B),*(E)) +#define pow_ri(B,E) spow_ui(*(B),*(E)) +#define pow_di(B,E) dpow_ui(*(B),*(E)) +#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} +#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} +#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} +#define s_cat(lpp, rpp, rnp, np, llp) { ftnlen i, nc, ll; char *f__rp, *lp; ll = (llp); lp = (lpp); for(i=0; i < (int)*(np); ++i) { nc = ll; if((rnp)[i] < nc) nc = (rnp)[i]; ll -= nc; f__rp = (rpp)[i]; while(--nc >= 0) *lp++ = *(f__rp)++; } while(--ll >= 0) *lp++ = ' '; } +#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) +#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } +#define sig_die(s, kill) { exit(1); } +#define s_stop(s, n) {exit(0);} +static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; +#define z_abs(z) (cabs(Cd(z))) +#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));} +#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));} +#define myexit_() break; +#define mycycle_() continue; +#define myceiling_(w) {ceil(w)} +#define myhuge_(w) {HUGE_VAL} +//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} +#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) + +/* procedure parameter types for -A and -C++ */ + +#define F2C_proc_par_types 1 +#ifdef __cplusplus +typedef logical (*L_fp)(...); +#else +typedef logical (*L_fp)(); +#endif + +static float spow_ui(float x, integer n) { + float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static double dpow_ui(double x, integer n) { + double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#ifdef _MSC_VER +static _Fcomplex cpow_ui(complex x, integer n) { + complex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; + for(u = n; ; ) { + if(u & 01) pow.r *= x.r, pow.i *= x.i; + if(u >>= 1) x.r *= x.r, x.i *= x.i; + else break; + } + } + _Fcomplex p={pow.r, pow.i}; + return p; +} +#else +static _Complex float cpow_ui(_Complex float x, integer n) { + _Complex float pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +#ifdef _MSC_VER +static _Dcomplex zpow_ui(_Dcomplex x, integer n) { + _Dcomplex pow={1.0,0.0}; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; + for(u = n; ; ) { + if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; + if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; + else break; + } + } + _Dcomplex p = {pow._Val[0], pow._Val[1]}; + return p; +} +#else +static _Complex double zpow_ui(_Complex double x, 
integer n) { + _Complex double pow=1.0; unsigned long int u; + if(n != 0) { + if(n < 0) n = -n, x = 1/x; + for(u = n; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +#endif +static integer pow_ii(integer x, integer n) { + integer pow; unsigned long int u; + if (n <= 0) { + if (n == 0 || x == 1) pow = 1; + else if (x != -1) pow = x == 0 ? 1/x : 0; + else n = -n; + } + if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { + u = n; + for(pow = 1; ; ) { + if(u & 01) pow *= x; + if(u >>= 1) x *= x; + else break; + } + } + return pow; +} +static integer dmaxloc_(double *w, integer s, integer e, integer *n) +{ + double m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static integer smaxloc_(float *w, integer s, integer e, integer *n) +{ + float m; integer i, mi; + for(m=w[s-1], mi=s, i=s+1; i<=e; i++) + if (w[i-1]>m) mi=i ,m=w[i-1]; + return mi-s+1; +} +static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { + integer n = *n_, incx = *incx_, incy = *incy_, i; +#ifdef _MSC_VER + _Fcomplex zdotc = {0.0, 0.0}; + if (incx == 1 && incy == 1) { + for (i=0;i \brief ZGELST solves overdetermined or underdetermined systems for GE matrices using QR or LQ factori +zation with compact WY representation of Q. */ + +/* =========== DOCUMENTATION =========== */ + +/* Online html documentation available at */ +/* http://www.netlib.org/lapack/explore-html/ */ + +/* > \htmlonly */ +/* > Download ZGELST + dependencies */ +/* > */ +/* > [TGZ] */ +/* > */ +/* > [ZIP] */ +/* > */ +/* > [TXT] */ +/* > \endhtmlonly */ + +/* Definition: */ +/* =========== */ + +/* SUBROUTINE ZGELST( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK, */ +/* INFO ) */ + +/* CHARACTER TRANS */ +/* INTEGER INFO, LDA, LDB, LWORK, M, N, NRHS */ +/* COMPLEX*16 A( LDA, * ), B( LDB, * ), WORK( * ) */ + + +/* > \par Purpose: */ +/* ============= */ +/* > */ +/* > \verbatim */ +/* > */ +/* > ZGELST solves overdetermined or underdetermined real linear systems */ +/* > involving an M-by-N matrix A, or its conjugate-transpose, using a QR */ +/* > or LQ factorization of A with compact WY representation of Q. */ +/* > It is assumed that A has full rank. */ +/* > */ +/* > The following options are provided: */ +/* > */ +/* > 1. If TRANS = 'N' and m >= n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A*X ||. */ +/* > */ +/* > 2. If TRANS = 'N' and m < n: find the minimum norm solution of */ +/* > an underdetermined system A * X = B. */ +/* > */ +/* > 3. If TRANS = 'C' and m >= n: find the minimum norm solution of */ +/* > an underdetermined system A**T * X = B. */ +/* > */ +/* > 4. If TRANS = 'C' and m < n: find the least squares solution of */ +/* > an overdetermined system, i.e., solve the least squares problem */ +/* > minimize || B - A**T * X ||. */ +/* > */ +/* > Several right hand side vectors b and solution vectors x can be */ +/* > handled in a single call; they are stored as the columns of the */ +/* > M-by-NRHS right hand side matrix B and the N-by-NRHS solution */ +/* > matrix X. */ +/* > \endverbatim */ + +/* Arguments: */ +/* ========== */ + +/* > \param[in] TRANS */ +/* > \verbatim */ +/* > TRANS is CHARACTER*1 */ +/* > = 'N': the linear system involves A; */ +/* > = 'C': the linear system involves A**H. 
*/ +/* > \endverbatim */ +/* > */ +/* > \param[in] M */ +/* > \verbatim */ +/* > M is INTEGER */ +/* > The number of rows of the matrix A. M >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] N */ +/* > \verbatim */ +/* > N is INTEGER */ +/* > The number of columns of the matrix A. N >= 0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] NRHS */ +/* > \verbatim */ +/* > NRHS is INTEGER */ +/* > The number of right hand sides, i.e., the number of */ +/* > columns of the matrices B and X. NRHS >=0. */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] A */ +/* > \verbatim */ +/* > A is COMPLEX*16 array, dimension (LDA,N) */ +/* > On entry, the M-by-N matrix A. */ +/* > On exit, */ +/* > if M >= N, A is overwritten by details of its QR */ +/* > factorization as returned by ZGEQRT; */ +/* > if M < N, A is overwritten by details of its LQ */ +/* > factorization as returned by ZGELQT. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDA */ +/* > \verbatim */ +/* > LDA is INTEGER */ +/* > The leading dimension of the array A. LDA >= f2cmax(1,M). */ +/* > \endverbatim */ +/* > */ +/* > \param[in,out] B */ +/* > \verbatim */ +/* > B is COMPLEX*16 array, dimension (LDB,NRHS) */ +/* > On entry, the matrix B of right hand side vectors, stored */ +/* > columnwise; B is M-by-NRHS if TRANS = 'N', or N-by-NRHS */ +/* > if TRANS = 'C'. */ +/* > On exit, if INFO = 0, B is overwritten by the solution */ +/* > vectors, stored columnwise: */ +/* > if TRANS = 'N' and m >= n, rows 1 to n of B contain the least */ +/* > squares solution vectors; the residual sum of squares for the */ +/* > solution in each column is given by the sum of squares of */ +/* > modulus of elements N+1 to M in that column; */ +/* > if TRANS = 'N' and m < n, rows 1 to N of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'C' and m >= n, rows 1 to M of B contain the */ +/* > minimum norm solution vectors; */ +/* > if TRANS = 'C' and m < n, rows 1 to M of B contain the */ +/* > least squares solution vectors; the residual sum of squares */ +/* > for the solution in each column is given by the sum of */ +/* > squares of the modulus of elements M+1 to N in that column. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LDB */ +/* > \verbatim */ +/* > LDB is INTEGER */ +/* > The leading dimension of the array B. LDB >= MAX(1,M,N). */ +/* > \endverbatim */ +/* > */ +/* > \param[out] WORK */ +/* > \verbatim */ +/* > WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)) */ +/* > On exit, if INFO = 0, WORK(1) returns the optimal LWORK. */ +/* > \endverbatim */ +/* > */ +/* > \param[in] LWORK */ +/* > \verbatim */ +/* > LWORK is INTEGER */ +/* > The dimension of the array WORK. */ +/* > LWORK >= f2cmax( 1, MN + f2cmax( MN, NRHS ) ). */ +/* > For optimal performance, */ +/* > LWORK >= f2cmax( 1, (MN + f2cmax( MN, NRHS ))*NB ). */ +/* > where MN = f2cmin(M,N) and NB is the optimum block size. */ +/* > */ +/* > If LWORK = -1, then a workspace query is assumed; the routine */ +/* > only calculates the optimal size of the WORK array, returns */ +/* > this value as the first entry of the WORK array, and no error */ +/* > message related to LWORK is issued by XERBLA. 
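   As a concrete instance of the bound above (numbers chosen only for
   illustration): for M = 1000, N = 600, NRHS = 2 we have MN = 600, so
   the minimum legal LWORK is 600 + max(600,2) = 1200, while with a
   block size NB = 32 the optimal LWORK is (600 + 600)*32 = 38400.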
*/ +/* > \endverbatim */ +/* > */ +/* > \param[out] INFO */ +/* > \verbatim */ +/* > INFO is INTEGER */ +/* > = 0: successful exit */ +/* > < 0: if INFO = -i, the i-th argument had an illegal value */ +/* > > 0: if INFO = i, the i-th diagonal element of the */ +/* > triangular factor of A is zero, so that A does not have */ +/* > full rank; the least squares solution could not be */ +/* > computed. */ +/* > \endverbatim */ + +/* Authors: */ +/* ======== */ + +/* > \author Univ. of Tennessee */ +/* > \author Univ. of California Berkeley */ +/* > \author Univ. of Colorado Denver */ +/* > \author NAG Ltd. */ + +/* > \ingroup complex16GEsolve */ + +/* > \par Contributors: */ +/* ================== */ +/* > */ +/* > \verbatim */ +/* > */ +/* > November 2022, Igor Kozachenko, */ +/* > Computer Science Division, */ +/* > University of California, Berkeley */ +/* > \endverbatim */ + +/* ===================================================================== */ +/* Subroutine */ int zgelst_(char *trans, integer *m, integer *n, integer * + nrhs, doublecomplex *a, integer *lda, doublecomplex *b, integer *ldb, + doublecomplex *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3; + doublereal d__1; + + /* Local variables */ + doublereal anrm, bnrm; + integer brow; + logical tpsd; + integer i__, j, iascl, ibscl; + extern logical lsame_(char *, char *); + integer nbmin; + doublereal rwork[1]; + integer lwopt; + extern /* Subroutine */ int dlabad_(doublereal *, doublereal *); + integer nb; + extern doublereal dlamch_(char *); + integer mn; + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + integer scllen; + doublereal bignum; + extern doublereal zlange_(char *, integer *, integer *, doublecomplex *, + integer *, doublereal *); + extern /* Subroutine */ int zlascl_(char *, integer *, integer *, + doublereal *, doublereal *, integer *, integer *, doublecomplex *, + integer *, integer *), zlaset_(char *, integer *, + integer *, doublecomplex *, doublecomplex *, doublecomplex *, + integer *); + integer mnnrhs; + extern /* Subroutine */ int zgelqt_(integer *, integer *, integer *, + doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *); + doublereal smlnum; + extern /* Subroutine */ int zgeqrt_(integer *, integer *, integer *, + doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *); + logical lquery; + extern /* Subroutine */ int ztrtrs_(char *, char *, char *, integer *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + integer *), zgemlqt_(char *, char *, + integer *, integer *, integer *, integer *, doublecomplex *, + integer *, doublecomplex *, integer *, doublecomplex *, integer *, + doublecomplex *, integer *), zgemqrt_(char *, + char *, integer *, integer *, integer *, integer *, doublecomplex + *, integer *, doublecomplex *, integer *, doublecomplex *, + integer *, doublecomplex *, integer *); + + +/* -- LAPACK driver routine -- */ +/* -- LAPACK is a software package provided by Univ. of Tennessee, -- */ +/* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */ + + +/* ===================================================================== */ + + +/* Test the input arguments. 
*/ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1 * 1; + a -= a_offset; + b_dim1 = *ldb; + b_offset = 1 + b_dim1 * 1; + b -= b_offset; + --work; + + /* Function Body */ + *info = 0; + mn = f2cmin(*m,*n); + lquery = *lwork == -1; + if (! (lsame_(trans, "N") || lsame_(trans, "C"))) { + *info = -1; + } else if (*m < 0) { + *info = -2; + } else if (*n < 0) { + *info = -3; + } else if (*nrhs < 0) { + *info = -4; + } else if (*lda < f2cmax(1,*m)) { + *info = -6; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = f2cmax(1,*m); + if (*ldb < f2cmax(i__1,*n)) { + *info = -8; + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = 1, i__2 = mn + f2cmax(mn,*nrhs); + if (*lwork < f2cmax(i__1,i__2) && ! lquery) { + *info = -10; + } + } + } + +/* Figure out optimal block size and optimal workspace size */ + + if (*info == 0 || *info == -10) { + + tpsd = TRUE_; + if (lsame_(trans, "N")) { + tpsd = FALSE_; + } + + nb = ilaenv_(&c__1, "ZGELST", " ", m, n, &c_n1, &c_n1, (ftnlen)6, ( + ftnlen)1); + + mnnrhs = f2cmax(mn,*nrhs); +/* Computing MAX */ + i__1 = 1, i__2 = (mn + mnnrhs) * nb; + lwopt = f2cmax(i__1,i__2); + d__1 = (doublereal) lwopt; + work[1].r = d__1, work[1].i = 0.; + + } + + if (*info != 0) { + i__1 = -(*info); + xerbla_("ZGELST ", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + +/* Computing MIN */ + i__1 = f2cmin(*m,*n); + if (f2cmin(i__1,*nrhs) == 0) { + i__1 = f2cmax(*m,*n); + zlaset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); + d__1 = (doublereal) lwopt; + work[1].r = d__1, work[1].i = 0.; + return 0; + } + +/* *GEQRT and *GELQT routines cannot accept NB larger than f2cmin(M,N) */ + + if (nb > mn) { + nb = mn; + } + +/* Determine the block size from the supplied LWORK */ +/* ( at this stage we know that LWORK >= (minimum required workspace, */ +/* but it may be less than optimal) */ + +/* Computing MIN */ + i__1 = nb, i__2 = *lwork / (mn + mnnrhs); + nb = f2cmin(i__1,i__2); + +/* The minimum value of NB, when blocked code is used */ + +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "ZGELST", " ", m, n, &c_n1, &c_n1, ( + ftnlen)6, (ftnlen)1); + nbmin = f2cmax(i__1,i__2); + + if (nb < nbmin) { + nb = 1; + } + +/* Get machine parameters */ + + smlnum = dlamch_("S") / dlamch_("P"); + bignum = 1. / smlnum; + dlabad_(&smlnum, &bignum); + +/* Scale A, B if f2cmax element outside range [SMLNUM,BIGNUM] */ + + anrm = zlange_("M", m, n, &a[a_offset], lda, rwork); + iascl = 0; + if (anrm > 0. && anrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + zlascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, + info); + iascl = 1; + } else if (anrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + zlascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda, + info); + iascl = 2; + } else if (anrm == 0.) { + +/* Matrix all zero. Return zero solution. */ + + i__1 = f2cmax(*m,*n); + zlaset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[b_offset], ldb); + d__1 = (doublereal) lwopt; + work[1].r = d__1, work[1].i = 0.; + return 0; + } + + brow = *m; + if (tpsd) { + brow = *n; + } + bnrm = zlange_("M", &brow, nrhs, &b[b_offset], ldb, rwork); + ibscl = 0; + if (bnrm > 0. 
&& bnrm < smlnum) { + +/* Scale matrix norm up to SMLNUM */ + + zlascl_("G", &c__0, &c__0, &bnrm, &smlnum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 1; + } else if (bnrm > bignum) { + +/* Scale matrix norm down to BIGNUM */ + + zlascl_("G", &c__0, &c__0, &bnrm, &bignum, &brow, nrhs, &b[b_offset], + ldb, info); + ibscl = 2; + } + + if (*m >= *n) { + +/* M > N: */ +/* Compute the blocked QR factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least N, optimally N*NB. */ + + zgeqrt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M > N, A is not transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A * X - B ||. */ + +/* Compute B(1:M,1:NRHS) := Q**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + zgemqrt_("Left", "Conjugate transpose", m, nrhs, n, &nb, &a[ + a_offset], lda, &work[1], &nb, &b[b_offset], ldb, &work[ + mn * nb + 1], info); + +/* Compute B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS) */ + + ztrtrs_("Upper", "No transpose", "Non-unit", n, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + + scllen = *n; + + } else { + +/* M > N, A is transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A**T * X = B. */ + +/* Compute B := inv(R**T) * B in two row blocks of B. */ + +/* Block 1: B(1:N,1:NRHS) := inv(R**T) * B(1:N,1:NRHS) */ + + ztrtrs_("Upper", "Conjugate transpose", "Non-unit", n, nrhs, &a[ + a_offset], lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + +/* Block 2: Zero out all rows below the N-th row in B: */ +/* B(N+1:M,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *m; + for (i__ = *n + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * b_dim1; + b[i__3].r = 0., b[i__3].i = 0.; + } + } + +/* Compute B(1:M,1:NRHS) := Q(1:N,:) * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + zgemqrt_("Left", "No transpose", m, nrhs, n, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + + scllen = *m; + + } + + } else { + +/* M < N: */ +/* Compute the blocked LQ factorization of A, */ +/* using the compact WY representation of Q, */ +/* workspace at least M, optimally M*NB. */ + + zgelqt_(m, n, &nb, &a[a_offset], lda, &work[1], &nb, &work[mn * nb + + 1], info); + + if (! tpsd) { + +/* M < N, A is not transposed: */ +/* Underdetermined system of equations, */ +/* minimum norm solution of A * X = B. */ + +/* Compute B := inv(L) * B in two row blocks of B. */ + +/* Block 1: B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS) */ + + ztrtrs_("Lower", "No transpose", "Non-unit", m, nrhs, &a[a_offset] + , lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + +/* Block 2: Zero out all rows below the M-th row in B: */ +/* B(M+1:N,1:NRHS) = ZERO */ + + i__1 = *nrhs; + for (j = 1; j <= i__1; ++j) { + i__2 = *n; + for (i__ = *m + 1; i__ <= i__2; ++i__) { + i__3 = i__ + j * b_dim1; + b[i__3].r = 0., b[i__3].i = 0.; + } + } + +/* Compute B(1:N,1:NRHS) := Q(1:N,:)**T * B(1:M,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. 
*/ + + zgemlqt_("Left", "Conjugate transpose", n, nrhs, m, &nb, &a[ + a_offset], lda, &work[1], &nb, &b[b_offset], ldb, &work[ + mn * nb + 1], info); + + scllen = *n; + + } else { + +/* M < N, A is transposed: */ +/* Overdetermined system of equations, */ +/* least-squares problem, f2cmin || A**T * X - B ||. */ + +/* Compute B(1:N,1:NRHS) := Q * B(1:N,1:NRHS), */ +/* using the compact WY representation of Q, */ +/* workspace at least NRHS, optimally NRHS*NB. */ + + zgemlqt_("Left", "No transpose", n, nrhs, m, &nb, &a[a_offset], + lda, &work[1], &nb, &b[b_offset], ldb, &work[mn * nb + 1], + info); + +/* Compute B(1:M,1:NRHS) := inv(L**T) * B(1:M,1:NRHS) */ + + ztrtrs_("Lower", "Conjugate transpose", "Non-unit", m, nrhs, &a[ + a_offset], lda, &b[b_offset], ldb, info); + + if (*info > 0) { + return 0; + } + + scllen = *m; + + } + + } + +/* Undo scaling */ + + if (iascl == 1) { + zlascl_("G", &c__0, &c__0, &anrm, &smlnum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (iascl == 2) { + zlascl_("G", &c__0, &c__0, &anrm, &bignum, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + if (ibscl == 1) { + zlascl_("G", &c__0, &c__0, &smlnum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } else if (ibscl == 2) { + zlascl_("G", &c__0, &c__0, &bignum, &bnrm, &scllen, nrhs, &b[b_offset] + , ldb, info); + } + + d__1 = (doublereal) lwopt; + work[1].r = d__1, work[1].i = 0.; + + return 0; + +/* End of ZGELST */ + +} /* zgelst_ */ + From 88cd91c4902cd40978420df9db123b392d771ad7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Nov 2022 23:15:20 +0100 Subject: [PATCH 119/154] Fix stray character --- cmake/lapack.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 8a5ff22ec..17ff8d0a0 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -638,7 +638,7 @@ set(CLASRC cgecon.c cgeequ.c cgees.c cgeesx.c cgeev.c cgeevx.c cgehd2.c cgehrd.c cgelq2.c cgelqf.c cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c - cgeqr2.c cgeqr2p.c cgeqrf.c fcgeqrfp.c cgerfs.c cgerq2.c cgerqf.c + cgeqr2.c cgeqr2p.c cgeqrf.c cgeqrfp.c cgerfs.c cgerq2.c cgerqf.c cgesc2.c cgesdd.c cgesvd.c cgesvdx.c cgesvj.c cgejsv.c cgsvj0.c cgsvj1.c cgesvx.c cgetc2.c cgetrf2.c From eea1636380fe6b8462e2ae73cc0e6c3c1aa0e3ce Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Nov 2022 13:22:55 +0100 Subject: [PATCH 120/154] Use normwise criterion for INF eigenvalues in QZ (Reference-LAPACK PR698) --- lapack-netlib/SRC/chgeqz.f | 9 ++------- lapack-netlib/SRC/dhgeqz.f | 9 ++------- lapack-netlib/SRC/shgeqz.f | 9 ++------- lapack-netlib/SRC/zhgeqz.f | 9 ++------- 4 files changed, 8 insertions(+), 28 deletions(-) diff --git a/lapack-netlib/SRC/chgeqz.f b/lapack-netlib/SRC/chgeqz.f index 8c1d62a87..50c6827ff 100644 --- a/lapack-netlib/SRC/chgeqz.f +++ b/lapack-netlib/SRC/chgeqz.f @@ -523,9 +523,7 @@ END IF END IF * - IF( ABS( T( ILAST, ILAST ) ).LE.MAX( SAFMIN, ULP*( - $ ABS( T( ILAST - 1, ILAST ) ) + ABS( T( ILAST-1, ILAST-1 ) - $ ) ) ) ) THEN + IF( ABS( T( ILAST, ILAST ) ).LE.BTOL ) THEN T( ILAST, ILAST ) = CZERO GO TO 50 END IF @@ -551,10 +549,7 @@ * * Test 2: for T(j,j)=0 * - TEMP = ABS ( T( J, J + 1 ) ) - IF ( J .GT. 
ILO ) - $ TEMP = TEMP + ABS ( T( J - 1, J ) ) - IF( ABS( T( J, J ) ).LT.MAX( SAFMIN,ULP*TEMP ) ) THEN + IF( ABS( T( J, J ) ).LT.BTOL ) THEN T( J, J ) = CZERO * * Test 1a: Check for 2 consecutive small subdiagonals in A diff --git a/lapack-netlib/SRC/dhgeqz.f b/lapack-netlib/SRC/dhgeqz.f index 3fe2a083c..b5a2917e3 100644 --- a/lapack-netlib/SRC/dhgeqz.f +++ b/lapack-netlib/SRC/dhgeqz.f @@ -536,9 +536,7 @@ END IF END IF * - IF( ABS( T( ILAST, ILAST ) ).LE.MAX( SAFMIN, ULP*( - $ ABS( T( ILAST - 1, ILAST ) ) + ABS( T( ILAST-1, ILAST-1 ) - $ ) ) ) ) THEN + IF( ABS( T( ILAST, ILAST ) ).LE.BTOL ) THEN T( ILAST, ILAST ) = ZERO GO TO 70 END IF @@ -564,10 +562,7 @@ * * Test 2: for T(j,j)=0 * - TEMP = ABS ( T( J, J + 1 ) ) - IF ( J .GT. ILO ) - $ TEMP = TEMP + ABS ( T( J - 1, J ) ) - IF( ABS( T( J, J ) ).LT.MAX( SAFMIN,ULP*TEMP ) ) THEN + IF( ABS( T( J, J ) ).LT.BTOL ) THEN T( J, J ) = ZERO * * Test 1a: Check for 2 consecutive small subdiagonals in A diff --git a/lapack-netlib/SRC/shgeqz.f b/lapack-netlib/SRC/shgeqz.f index 79a9c6092..10fb2b7d7 100644 --- a/lapack-netlib/SRC/shgeqz.f +++ b/lapack-netlib/SRC/shgeqz.f @@ -536,9 +536,7 @@ END IF END IF * - IF( ABS( T( ILAST, ILAST ) ).LE.MAX( SAFMIN, ULP*( - $ ABS( T( ILAST - 1, ILAST ) ) + ABS( T( ILAST-1, ILAST-1 ) - $ ) ) ) ) THEN + IF( ABS( T( ILAST, ILAST ) ).LE.BTOL ) THEN T( ILAST, ILAST ) = ZERO GO TO 70 END IF @@ -564,10 +562,7 @@ * * Test 2: for T(j,j)=0 * - TEMP = ABS ( T( J, J + 1 ) ) - IF ( J .GT. ILO ) - $ TEMP = TEMP + ABS ( T( J - 1, J ) ) - IF( ABS( T( J, J ) ).LT.MAX( SAFMIN,ULP*TEMP ) ) THEN + IF( ABS( T( J, J ) ).LT.BTOL ) THEN T( J, J ) = ZERO * * Test 1a: Check for 2 consecutive small subdiagonals in A diff --git a/lapack-netlib/SRC/zhgeqz.f b/lapack-netlib/SRC/zhgeqz.f index 302b69f34..c15e7aace 100644 --- a/lapack-netlib/SRC/zhgeqz.f +++ b/lapack-netlib/SRC/zhgeqz.f @@ -524,9 +524,7 @@ END IF END IF * - IF( ABS( T( ILAST, ILAST ) ).LE.MAX( SAFMIN, ULP*( - $ ABS( T( ILAST - 1, ILAST ) ) + ABS( T( ILAST-1, ILAST-1 ) - $ ) ) ) ) THEN + IF( ABS( T( ILAST, ILAST ) ).LE.BTOL ) THEN T( ILAST, ILAST ) = CZERO GO TO 50 END IF @@ -552,10 +550,7 @@ * * Test 2: for T(j,j)=0 * - TEMP = ABS ( T( J, J + 1 ) ) - IF ( J .GT. 
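*     (Context for the ?HGEQZ hunks in this patch: BTOL is the normwise
*      tolerance computed near the start of each routine, essentially
*      MAX( SAFMIN, ULP*norm(T) ), so a diagonal entry of T is now
*      declared zero - signalling an infinite eigenvalue - by comparison
*      with the whole matrix rather than with its two neighbouring
*      entries, which is the point of Reference-LAPACK PR 698 cited in
*      the commit message.)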
ILO ) - $ TEMP = TEMP + ABS ( T( J - 1, J ) ) - IF( ABS( T( J, J ) ).LT.MAX( SAFMIN,ULP*TEMP ) ) THEN + IF( ABS( T( J, J ) ).LT.BTOL ) THEN T( J, J ) = CZERO * * Test 1a: Check for 2 consecutive small subdiagonals in A From 60af35bfab111416f78db3a7797f2134c7f23ea0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Nov 2022 13:25:21 +0100 Subject: [PATCH 121/154] Fix workspace query for ?SYEVD and ?HEEVD (Reference-LAPACK PR691) --- lapack-netlib/SRC/cheevd.f | 2 +- lapack-netlib/SRC/dsyevd.f | 2 +- lapack-netlib/SRC/ssyevd.f | 2 +- lapack-netlib/SRC/zheevd.f | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/SRC/cheevd.f b/lapack-netlib/SRC/cheevd.f index 9a4a1efb7..2ddf74b98 100644 --- a/lapack-netlib/SRC/cheevd.f +++ b/lapack-netlib/SRC/cheevd.f @@ -284,7 +284,7 @@ LIWMIN = 1 END IF LOPT = MAX( LWMIN, N + - $ ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) ) + $ N*ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) ) LROPT = LRWMIN LIOPT = LIWMIN END IF diff --git a/lapack-netlib/SRC/dsyevd.f b/lapack-netlib/SRC/dsyevd.f index edbe896fe..eaaecd8d9 100644 --- a/lapack-netlib/SRC/dsyevd.f +++ b/lapack-netlib/SRC/dsyevd.f @@ -257,7 +257,7 @@ LWMIN = 2*N + 1 END IF LOPT = MAX( LWMIN, 2*N + - $ ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) ) + $ N*ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 ) ) LIOPT = LIWMIN END IF WORK( 1 ) = LOPT diff --git a/lapack-netlib/SRC/ssyevd.f b/lapack-netlib/SRC/ssyevd.f index 8b90d9263..ac0d0284d 100644 --- a/lapack-netlib/SRC/ssyevd.f +++ b/lapack-netlib/SRC/ssyevd.f @@ -255,7 +255,7 @@ LWMIN = 2*N + 1 END IF LOPT = MAX( LWMIN, 2*N + - $ ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) ) + $ N*ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) ) LIOPT = LIWMIN END IF WORK( 1 ) = LOPT diff --git a/lapack-netlib/SRC/zheevd.f b/lapack-netlib/SRC/zheevd.f index a6484eb03..7f58c7f72 100644 --- a/lapack-netlib/SRC/zheevd.f +++ b/lapack-netlib/SRC/zheevd.f @@ -284,7 +284,7 @@ LIWMIN = 1 END IF LOPT = MAX( LWMIN, N + - $ ILAENV( 1, 'ZHETRD', UPLO, N, -1, -1, -1 ) ) + $ N*ILAENV( 1, 'ZHETRD', UPLO, N, -1, -1, -1 ) ) LROPT = LRWMIN LIOPT = LIWMIN END IF From 3f31b691211a772b61c0e016961e9c0d8f05a02e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Nov 2022 13:30:25 +0100 Subject: [PATCH 122/154] Add quick return if scaling with one (Reference-LAPACK PR674) --- lapack-netlib/SRC/clascl.f | 2 ++ lapack-netlib/SRC/dlascl.f | 2 ++ lapack-netlib/SRC/slascl.f | 2 ++ lapack-netlib/SRC/zlascl.f | 2 ++ 4 files changed, 8 insertions(+) diff --git a/lapack-netlib/SRC/clascl.f b/lapack-netlib/SRC/clascl.f index 399af23a4..f9aace0bc 100644 --- a/lapack-netlib/SRC/clascl.f +++ b/lapack-netlib/SRC/clascl.f @@ -272,6 +272,8 @@ ELSE MUL = CTOC / CFROMC DONE = .TRUE. + IF (MUL .EQ. ONE) + $ RETURN END IF END IF * diff --git a/lapack-netlib/SRC/dlascl.f b/lapack-netlib/SRC/dlascl.f index 05ad1c4f3..0a4bf21ce 100644 --- a/lapack-netlib/SRC/dlascl.f +++ b/lapack-netlib/SRC/dlascl.f @@ -272,6 +272,8 @@ ELSE MUL = CTOC / CFROMC DONE = .TRUE. + IF (MUL .EQ. ONE) + $ RETURN END IF END IF * diff --git a/lapack-netlib/SRC/slascl.f b/lapack-netlib/SRC/slascl.f index e1cb420ea..28cbd6514 100644 --- a/lapack-netlib/SRC/slascl.f +++ b/lapack-netlib/SRC/slascl.f @@ -272,6 +272,8 @@ ELSE MUL = CTOC / CFROMC DONE = .TRUE. + IF (MUL .EQ. 
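*     (Two brief notes on the fixes flowing together here: the ?SYEVD /
*      ?HEEVD workspace queries above now multiply ILAENV's block size
*      by N because the blocked reduction to tridiagonal form needs on
*      the order of N*NB workspace, not just NB; and the quick return
*      being added to the ?LASCL routines skips the element-by-element
*      scaling pass entirely when the computed multiplier is exactly
*      one, i.e. when the requested CFROM and CTO coincide.)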
ONE) + $ RETURN END IF END IF * diff --git a/lapack-netlib/SRC/zlascl.f b/lapack-netlib/SRC/zlascl.f index 3d53f5ae6..4cce5ff5e 100644 --- a/lapack-netlib/SRC/zlascl.f +++ b/lapack-netlib/SRC/zlascl.f @@ -272,6 +272,8 @@ ELSE MUL = CTOC / CFROMC DONE = .TRUE. + IF (MUL .EQ. ONE) + $ RETURN END IF END IF * From 9e29312c8311efb22998029b330c3cba4c04b5da Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Nov 2022 16:34:45 +0100 Subject: [PATCH 123/154] Fix type precision and function documentation (Reference-LAPACK PRs 647+702) --- lapack-netlib/SRC/dorbdb6.f | 84 +++++++++++++++++++++---------------- lapack-netlib/SRC/zunbdb6.f | 84 +++++++++++++++++++++---------------- 2 files changed, 94 insertions(+), 74 deletions(-) diff --git a/lapack-netlib/SRC/dorbdb6.f b/lapack-netlib/SRC/dorbdb6.f index fac52f760..45c8ba8a2 100644 --- a/lapack-netlib/SRC/dorbdb6.f +++ b/lapack-netlib/SRC/dorbdb6.f @@ -41,10 +41,16 @@ *> with respect to the columns of *> Q = [ Q1 ] . *> [ Q2 ] -*> The columns of Q must be orthonormal. +*> The Euclidean norm of X must be one and the columns of Q must be +*> orthonormal. The orthogonalized vector will be zero if and only if it +*> lies entirely in the range of Q. *> -*> If the projection is zero according to Kahan's "twice is enough" -*> criterion, then the zero vector is returned. +*> The projection is computed with at most two iterations of the +*> classical Gram-Schmidt algorithm, see +*> * L. Giraud, J. Langou, M. Rozložník. "On the round-off error +*> analysis of the Gram-Schmidt algorithm with reorthogonalization." +*> 2002. CERFACS Technical Report No. TR/PA/02/33. URL: +*> https://www.cerfacs.fr/algor/reports/2002/TR_PA_02_33.pdf *> *>\endverbatim * @@ -167,15 +173,18 @@ * ===================================================================== * * .. Parameters .. - DOUBLE PRECISION ALPHASQ, REALONE, REALZERO - PARAMETER ( ALPHASQ = 0.01D0, REALONE = 1.0D0, + DOUBLE PRECISION ALPHA, REALONE, REALZERO + PARAMETER ( ALPHA = 0.01D0, REALONE = 1.0D0, $ REALZERO = 0.0D0 ) DOUBLE PRECISION NEGONE, ONE, ZERO PARAMETER ( NEGONE = -1.0D0, ONE = 1.0D0, ZERO = 0.0D0 ) * .. * .. Local Scalars .. - INTEGER I - DOUBLE PRECISION NORMSQ1, NORMSQ2, SCL1, SCL2, SSQ1, SSQ2 + INTEGER I, IX + DOUBLE PRECISION EPS, NORM, NORM_NEW, SCL, SSQ +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH * .. * .. External Subroutines .. EXTERNAL DGEMV, DLASSQ, XERBLA @@ -210,17 +219,17 @@ CALL XERBLA( 'DORBDB6', -INFO ) RETURN END IF +* + EPS = DLAMCH( 'Precision' ) * * First, project X onto the orthogonal complement of Q's column * space * - SCL1 = REALZERO - SSQ1 = REALONE - CALL DLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL DLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ1 = SCL1**2*SSQ1 + SCL2**2*SSQ2 +* Christoph Conrads: In debugging mode the norm should be computed +* and an assertion added comparing the norm with one. Alas, Fortran +* never made it into 1989 when assert() was introduced into the C +* programming language. + NORM = REALONE * IF( M1 .EQ. 
0 ) THEN DO I = 1, N @@ -238,27 +247,31 @@ CALL DGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL DLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL DLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL DLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL DLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If projection is sufficiently large in norm, then stop. * If projection is zero, then stop. * Otherwise, project again. * - IF( NORMSQ2 .GE. ALPHASQ*NORMSQ1 ) THEN + IF( NORM_NEW .GE. ALPHA * NORM ) THEN RETURN END IF * - IF( NORMSQ2 .EQ. ZERO ) THEN + IF( NORM_NEW .LE. N * EPS * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1( IX ) = ZERO + END DO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2( IX ) = ZERO + END DO RETURN END IF * - NORMSQ1 = NORMSQ2 + NORM = NORM_NEW * DO I = 1, N WORK(I) = ZERO @@ -280,24 +293,22 @@ CALL DGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL DLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL DLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL DLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL DLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If second projection is sufficiently large in norm, then do * nothing more. Alternatively, if it shrunk significantly, then * truncate it to zero. * - IF( NORMSQ2 .LT. ALPHASQ*NORMSQ1 ) THEN - DO I = 1, M1 - X1(I) = ZERO + IF( NORM_NEW .LT. ALPHA * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1(IX) = ZERO END DO - DO I = 1, M2 - X2(I) = ZERO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2(IX) = ZERO END DO END IF * @@ -306,4 +317,3 @@ * End of DORBDB6 * END - diff --git a/lapack-netlib/SRC/zunbdb6.f b/lapack-netlib/SRC/zunbdb6.f index ec681b597..ed666e449 100644 --- a/lapack-netlib/SRC/zunbdb6.f +++ b/lapack-netlib/SRC/zunbdb6.f @@ -41,10 +41,16 @@ *> with respect to the columns of *> Q = [ Q1 ] . *> [ Q2 ] -*> The columns of Q must be orthonormal. +*> The Euclidean norm of X must be one and the columns of Q must be +*> orthonormal. The orthogonalized vector will be zero if and only if it +*> lies entirely in the range of Q. *> -*> If the projection is zero according to Kahan's "twice is enough" -*> criterion, then the zero vector is returned. +*> The projection is computed with at most two iterations of the +*> classical Gram-Schmidt algorithm, see +*> * L. Giraud, J. Langou, M. Rozložník. "On the round-off error +*> analysis of the Gram-Schmidt algorithm with reorthogonalization." +*> 2002. CERFACS Technical Report No. TR/PA/02/33. URL: +*> https://www.cerfacs.fr/algor/reports/2002/TR_PA_02_33.pdf *> *>\endverbatim * @@ -167,16 +173,19 @@ * ===================================================================== * * .. Parameters .. - DOUBLE PRECISION ALPHASQ, REALONE, REALZERO - PARAMETER ( ALPHASQ = 0.01D0, REALONE = 1.0D0, + DOUBLE PRECISION ALPHA, REALONE, REALZERO + PARAMETER ( ALPHA = 0.01D0, REALONE = 1.0D0, $ REALZERO = 0.0D0 ) COMPLEX*16 NEGONE, ONE, ZERO PARAMETER ( NEGONE = (-1.0D0,0.0D0), ONE = (1.0D0,0.0D0), $ ZERO = (0.0D0,0.0D0) ) * .. * .. Local Scalars .. - INTEGER I - DOUBLE PRECISION NORMSQ1, NORMSQ2, SCL1, SCL2, SSQ1, SSQ2 + INTEGER I, IX + DOUBLE PRECISION EPS, NORM, NORM_NEW, SCL, SSQ +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH * .. * .. External Subroutines .. 
EXTERNAL ZGEMV, ZLASSQ, XERBLA @@ -211,17 +220,17 @@ CALL XERBLA( 'ZUNBDB6', -INFO ) RETURN END IF +* + EPS = DLAMCH( 'Precision' ) * * First, project X onto the orthogonal complement of Q's column * space * - SCL1 = REALZERO - SSQ1 = REALONE - CALL ZLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL ZLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ1 = SCL1**2*SSQ1 + SCL2**2*SSQ2 +* Christoph Conrads: In debugging mode the norm should be computed +* and an assertion added comparing the norm with one. Alas, Fortran +* never made it into 1989 when assert() was introduced into the C +* programming language. + NORM = REALONE * IF( M1 .EQ. 0 ) THEN DO I = 1, N @@ -239,27 +248,31 @@ CALL ZGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL ZLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL ZLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL ZLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL ZLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If projection is sufficiently large in norm, then stop. * If projection is zero, then stop. * Otherwise, project again. * - IF( NORMSQ2 .GE. ALPHASQ*NORMSQ1 ) THEN + IF( NORM_NEW .GE. ALPHA * NORM ) THEN RETURN END IF * - IF( NORMSQ2 .EQ. ZERO ) THEN + IF( NORM_NEW .LE. N * EPS * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1( IX ) = ZERO + END DO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2( IX ) = ZERO + END DO RETURN END IF * - NORMSQ1 = NORMSQ2 + NORM = NORM_NEW * DO I = 1, N WORK(I) = ZERO @@ -281,24 +294,22 @@ CALL ZGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL ZLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL ZLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL ZLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL ZLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If second projection is sufficiently large in norm, then do * nothing more. Alternatively, if it shrunk significantly, then * truncate it to zero. * - IF( NORMSQ2 .LT. ALPHASQ*NORMSQ1 ) THEN - DO I = 1, M1 - X1(I) = ZERO + IF( NORM_NEW .LT. ALPHA * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1(IX) = ZERO END DO - DO I = 1, M2 - X2(I) = ZERO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2(IX) = ZERO END DO END IF * @@ -307,4 +318,3 @@ * End of ZUNBDB6 * END - From b9468205021146e1f45f9c91e1bda9699ef68bf4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Nov 2022 16:36:19 +0100 Subject: [PATCH 124/154] Fix uninitialized variable (Reference-LAPACK PR647) --- lapack-netlib/SRC/cunbdb6.f | 84 +++++++++++++++++++++---------------- lapack-netlib/SRC/sorbdb6.f | 84 +++++++++++++++++++++---------------- 2 files changed, 94 insertions(+), 74 deletions(-) diff --git a/lapack-netlib/SRC/cunbdb6.f b/lapack-netlib/SRC/cunbdb6.f index 7acc99cb8..b93a389d6 100644 --- a/lapack-netlib/SRC/cunbdb6.f +++ b/lapack-netlib/SRC/cunbdb6.f @@ -41,10 +41,16 @@ *> with respect to the columns of *> Q = [ Q1 ] . *> [ Q2 ] -*> The columns of Q must be orthonormal. +*> The Euclidean norm of X must be one and the columns of Q must be +*> orthonormal. The orthogonalized vector will be zero if and only if it +*> lies entirely in the range of Q. *> -*> If the projection is zero according to Kahan's "twice is enough" -*> criterion, then the zero vector is returned. 
+*> The projection is computed with at most two iterations of the +*> classical Gram-Schmidt algorithm, see +*> * L. Giraud, J. Langou, M. Rozložník. "On the round-off error +*> analysis of the Gram-Schmidt algorithm with reorthogonalization." +*> 2002. CERFACS Technical Report No. TR/PA/02/33. URL: +*> https://www.cerfacs.fr/algor/reports/2002/TR_PA_02_33.pdf *> *>\endverbatim * @@ -167,16 +173,19 @@ * ===================================================================== * * .. Parameters .. - REAL ALPHASQ, REALONE, REALZERO - PARAMETER ( ALPHASQ = 0.01E0, REALONE = 1.0E0, + REAL ALPHA, REALONE, REALZERO + PARAMETER ( ALPHA = 0.01E0, REALONE = 1.0E0, $ REALZERO = 0.0E0 ) COMPLEX NEGONE, ONE, ZERO PARAMETER ( NEGONE = (-1.0E0,0.0E0), ONE = (1.0E0,0.0E0), $ ZERO = (0.0E0,0.0E0) ) * .. * .. Local Scalars .. - INTEGER I - REAL NORMSQ1, NORMSQ2, SCL1, SCL2, SSQ1, SSQ2 + INTEGER I, IX + REAL EPS, NORM, NORM_NEW, SCL, SSQ +* .. +* .. External Functions .. + REAL SLAMCH * .. * .. External Subroutines .. EXTERNAL CGEMV, CLASSQ, XERBLA @@ -211,17 +220,17 @@ CALL XERBLA( 'CUNBDB6', -INFO ) RETURN END IF +* + EPS = SLAMCH( 'Precision' ) * * First, project X onto the orthogonal complement of Q's column * space * - SCL1 = REALZERO - SSQ1 = REALONE - CALL CLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL CLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ1 = SCL1**2*SSQ1 + SCL2**2*SSQ2 +* Christoph Conrads: In debugging mode the norm should be computed +* and an assertion added comparing the norm with one. Alas, Fortran +* never made it into 1989 when assert() was introduced into the C +* programming language. + NORM = REALONE * IF( M1 .EQ. 0 ) THEN DO I = 1, N @@ -239,27 +248,31 @@ CALL CGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL CLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL CLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL CLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL CLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If projection is sufficiently large in norm, then stop. * If projection is zero, then stop. * Otherwise, project again. * - IF( NORMSQ2 .GE. ALPHASQ*NORMSQ1 ) THEN + IF( NORM_NEW .GE. ALPHA * NORM ) THEN RETURN END IF * - IF( NORMSQ2 .EQ. ZERO ) THEN + IF( NORM_NEW .LE. N * EPS * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1( IX ) = ZERO + END DO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2( IX ) = ZERO + END DO RETURN END IF * - NORMSQ1 = NORMSQ2 + NORM = NORM_NEW * DO I = 1, N WORK(I) = ZERO @@ -281,24 +294,22 @@ CALL CGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL CLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL CLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL CLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL CLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If second projection is sufficiently large in norm, then do * nothing more. Alternatively, if it shrunk significantly, then * truncate it to zero. * - IF( NORMSQ2 .LT. ALPHASQ*NORMSQ1 ) THEN - DO I = 1, M1 - X1(I) = ZERO + IF( NORM_NEW .LT. 
ALPHA * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1(IX) = ZERO END DO - DO I = 1, M2 - X2(I) = ZERO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2(IX) = ZERO END DO END IF * @@ -307,4 +318,3 @@ * End of CUNBDB6 * END - diff --git a/lapack-netlib/SRC/sorbdb6.f b/lapack-netlib/SRC/sorbdb6.f index a23b42beb..b2449e3be 100644 --- a/lapack-netlib/SRC/sorbdb6.f +++ b/lapack-netlib/SRC/sorbdb6.f @@ -41,10 +41,16 @@ *> with respect to the columns of *> Q = [ Q1 ] . *> [ Q2 ] -*> The columns of Q must be orthonormal. +*> The Euclidean norm of X must be one and the columns of Q must be +*> orthonormal. The orthogonalized vector will be zero if and only if it +*> lies entirely in the range of Q. *> -*> If the projection is zero according to Kahan's "twice is enough" -*> criterion, then the zero vector is returned. +*> The projection is computed with at most two iterations of the +*> classical Gram-Schmidt algorithm, see +*> * L. Giraud, J. Langou, M. Rozložník. "On the round-off error +*> analysis of the Gram-Schmidt algorithm with reorthogonalization." +*> 2002. CERFACS Technical Report No. TR/PA/02/33. URL: +*> https://www.cerfacs.fr/algor/reports/2002/TR_PA_02_33.pdf *> *>\endverbatim * @@ -167,15 +173,18 @@ * ===================================================================== * * .. Parameters .. - REAL ALPHASQ, REALONE, REALZERO - PARAMETER ( ALPHASQ = 0.01E0, REALONE = 1.0E0, + REAL ALPHA, REALONE, REALZERO + PARAMETER ( ALPHA = 0.01E0, REALONE = 1.0E0, $ REALZERO = 0.0E0 ) REAL NEGONE, ONE, ZERO PARAMETER ( NEGONE = -1.0E0, ONE = 1.0E0, ZERO = 0.0E0 ) * .. * .. Local Scalars .. - INTEGER I - REAL NORMSQ1, NORMSQ2, SCL1, SCL2, SSQ1, SSQ2 + INTEGER I, IX + REAL EPS, NORM, NORM_NEW, SCL, SSQ +* .. +* .. External Functions .. + REAL SLAMCH * .. * .. External Subroutines .. EXTERNAL SGEMV, SLASSQ, XERBLA @@ -210,17 +219,17 @@ CALL XERBLA( 'SORBDB6', -INFO ) RETURN END IF +* + EPS = SLAMCH( 'Precision' ) * * First, project X onto the orthogonal complement of Q's column * space * - SCL1 = REALZERO - SSQ1 = REALONE - CALL SLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL SLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ1 = SCL1**2*SSQ1 + SCL2**2*SSQ2 +* Christoph Conrads: In debugging mode the norm should be computed +* and an assertion added comparing the norm with one. Alas, Fortran +* never made it into 1989 when assert() was introduced into the C +* programming language. + NORM = REALONE * IF( M1 .EQ. 0 ) THEN DO I = 1, N @@ -238,27 +247,31 @@ CALL SGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL SLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL SLASSQ( M2, X2, INCX2, SCL2, SSQ2 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL SLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL SLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If projection is sufficiently large in norm, then stop. * If projection is zero, then stop. * Otherwise, project again. * - IF( NORMSQ2 .GE. ALPHASQ*NORMSQ1 ) THEN + IF( NORM_NEW .GE. ALPHA * NORM ) THEN RETURN END IF * - IF( NORMSQ2 .EQ. ZERO ) THEN + IF( NORM_NEW .LE. 
N * EPS * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1( IX ) = ZERO + END DO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2( IX ) = ZERO + END DO RETURN END IF * - NORMSQ1 = NORMSQ2 + NORM = NORM_NEW * DO I = 1, N WORK(I) = ZERO @@ -280,24 +293,22 @@ CALL SGEMV( 'N', M2, N, NEGONE, Q2, LDQ2, WORK, 1, ONE, X2, $ INCX2 ) * - SCL1 = REALZERO - SSQ1 = REALONE - CALL SLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - SCL2 = REALZERO - SSQ2 = REALONE - CALL SLASSQ( M1, X1, INCX1, SCL1, SSQ1 ) - NORMSQ2 = SCL1**2*SSQ1 + SCL2**2*SSQ2 + SCL = REALZERO + SSQ = REALZERO + CALL SLASSQ( M1, X1, INCX1, SCL, SSQ ) + CALL SLASSQ( M2, X2, INCX2, SCL, SSQ ) + NORM_NEW = SCL * SQRT(SSQ) * * If second projection is sufficiently large in norm, then do * nothing more. Alternatively, if it shrunk significantly, then * truncate it to zero. * - IF( NORMSQ2 .LT. ALPHASQ*NORMSQ1 ) THEN - DO I = 1, M1 - X1(I) = ZERO + IF( NORM_NEW .LT. ALPHA * NORM ) THEN + DO IX = 1, 1 + (M1-1)*INCX1, INCX1 + X1(IX) = ZERO END DO - DO I = 1, M2 - X2(I) = ZERO + DO IX = 1, 1 + (M2-1)*INCX2, INCX2 + X2(IX) = ZERO END DO END IF * @@ -306,4 +317,3 @@ * End of SORBDB6 * END - From aaea0804bcb0318e96d3fcfd32dc66570c633c4a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Nov 2022 16:38:57 +0100 Subject: [PATCH 125/154] Fix function documentation (Reference-LAPACK PR697) --- lapack-netlib/SRC/cunbdb2.f | 4 ++-- lapack-netlib/SRC/cunbdb4.f | 4 ++-- lapack-netlib/SRC/dorbdb2.f | 4 ++-- lapack-netlib/SRC/dorbdb4.f | 4 ++-- lapack-netlib/SRC/sorbdb2.f | 4 ++-- lapack-netlib/SRC/sorbdb4.f | 4 ++-- lapack-netlib/SRC/zunbdb2.f | 4 ++-- lapack-netlib/SRC/zunbdb4.f | 4 ++-- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/lapack-netlib/SRC/cunbdb2.f b/lapack-netlib/SRC/cunbdb2.f index db238f925..b45db6100 100644 --- a/lapack-netlib/SRC/cunbdb2.f +++ b/lapack-netlib/SRC/cunbdb2.f @@ -122,14 +122,14 @@ *> *> \param[out] TAUP1 *> \verbatim -*> TAUP1 is COMPLEX array, dimension (P) +*> TAUP1 is COMPLEX array, dimension (P-1) *> The scalar factors of the elementary reflectors that define *> P1. *> \endverbatim *> *> \param[out] TAUP2 *> \verbatim -*> TAUP2 is COMPLEX array, dimension (M-P) +*> TAUP2 is COMPLEX array, dimension (Q) *> The scalar factors of the elementary reflectors that define *> P2. *> \endverbatim diff --git a/lapack-netlib/SRC/cunbdb4.f b/lapack-netlib/SRC/cunbdb4.f index e6afd89c3..117f23d08 100644 --- a/lapack-netlib/SRC/cunbdb4.f +++ b/lapack-netlib/SRC/cunbdb4.f @@ -124,14 +124,14 @@ *> *> \param[out] TAUP1 *> \verbatim -*> TAUP1 is COMPLEX array, dimension (P) +*> TAUP1 is COMPLEX array, dimension (M-Q) *> The scalar factors of the elementary reflectors that define *> P1. *> \endverbatim *> *> \param[out] TAUP2 *> \verbatim -*> TAUP2 is COMPLEX array, dimension (M-P) +*> TAUP2 is COMPLEX array, dimension (M-Q) *> The scalar factors of the elementary reflectors that define *> P2. *> \endverbatim diff --git a/lapack-netlib/SRC/dorbdb2.f b/lapack-netlib/SRC/dorbdb2.f index 64e4645bc..a0dacbb16 100644 --- a/lapack-netlib/SRC/dorbdb2.f +++ b/lapack-netlib/SRC/dorbdb2.f @@ -122,14 +122,14 @@ *> *> \param[out] TAUP1 *> \verbatim -*> TAUP1 is DOUBLE PRECISION array, dimension (P) +*> TAUP1 is DOUBLE PRECISION array, dimension (P-1) *> The scalar factors of the elementary reflectors that define *> P1. 
 *> \endverbatim
 *>
 *> \param[out] TAUP2
 *> \verbatim
-*>          TAUP2 is DOUBLE PRECISION array, dimension (M-P)
+*>          TAUP2 is DOUBLE PRECISION array, dimension (Q)
 *>          The scalar factors of the elementary reflectors that define
 *>          P2.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dorbdb4.f b/lapack-netlib/SRC/dorbdb4.f
index a09568415..08604be45 100644
--- a/lapack-netlib/SRC/dorbdb4.f
+++ b/lapack-netlib/SRC/dorbdb4.f
@@ -124,14 +124,14 @@
 *>
 *> \param[out] TAUP1
 *> \verbatim
-*>          TAUP1 is DOUBLE PRECISION array, dimension (P)
+*>          TAUP1 is DOUBLE PRECISION array, dimension (M-Q)
 *>          The scalar factors of the elementary reflectors that define
 *>          P1.
 *> \endverbatim
 *>
 *> \param[out] TAUP2
 *> \verbatim
-*>          TAUP2 is DOUBLE PRECISION array, dimension (M-P)
+*>          TAUP2 is DOUBLE PRECISION array, dimension (M-Q)
 *>          The scalar factors of the elementary reflectors that define
 *>          P2.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/sorbdb2.f b/lapack-netlib/SRC/sorbdb2.f
index ad3eb269d..484d352f8 100644
--- a/lapack-netlib/SRC/sorbdb2.f
+++ b/lapack-netlib/SRC/sorbdb2.f
@@ -122,14 +122,14 @@
 *>
 *> \param[out] TAUP1
 *> \verbatim
-*>          TAUP1 is REAL array, dimension (P)
+*>          TAUP1 is REAL array, dimension (P-1)
 *>          The scalar factors of the elementary reflectors that define
 *>          P1.
 *> \endverbatim
 *>
 *> \param[out] TAUP2
 *> \verbatim
-*>          TAUP2 is REAL array, dimension (M-P)
+*>          TAUP2 is REAL array, dimension (Q)
 *>          The scalar factors of the elementary reflectors that define
 *>          P2.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/sorbdb4.f b/lapack-netlib/SRC/sorbdb4.f
index b18ed3b27..bf60fb7bb 100644
--- a/lapack-netlib/SRC/sorbdb4.f
+++ b/lapack-netlib/SRC/sorbdb4.f
@@ -124,14 +124,14 @@
 *>
 *> \param[out] TAUP1
 *> \verbatim
-*>          TAUP1 is REAL array, dimension (P)
+*>          TAUP1 is REAL array, dimension (M-Q)
 *>          The scalar factors of the elementary reflectors that define
 *>          P1.
 *> \endverbatim
 *>
 *> \param[out] TAUP2
 *> \verbatim
-*>          TAUP2 is REAL array, dimension (M-P)
+*>          TAUP2 is REAL array, dimension (M-Q)
 *>          The scalar factors of the elementary reflectors that define
 *>          P2.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zunbdb2.f b/lapack-netlib/SRC/zunbdb2.f
index 412d8d8d0..46b08aa1e 100644
--- a/lapack-netlib/SRC/zunbdb2.f
+++ b/lapack-netlib/SRC/zunbdb2.f
@@ -122,14 +122,14 @@
 *>
 *> \param[out] TAUP1
 *> \verbatim
-*>          TAUP1 is COMPLEX*16 array, dimension (P)
+*>          TAUP1 is COMPLEX*16 array, dimension (P-1)
 *>          The scalar factors of the elementary reflectors that define
 *>          P1.
 *> \endverbatim
 *>
 *> \param[out] TAUP2
 *> \verbatim
-*>          TAUP2 is COMPLEX*16 array, dimension (M-P)
+*>          TAUP2 is COMPLEX*16 array, dimension (Q)
 *>          The scalar factors of the elementary reflectors that define
 *>          P2.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zunbdb4.f b/lapack-netlib/SRC/zunbdb4.f
index b1fcd8bd0..4672cfa67 100644
--- a/lapack-netlib/SRC/zunbdb4.f
+++ b/lapack-netlib/SRC/zunbdb4.f
@@ -124,14 +124,14 @@
 *>
 *> \param[out] TAUP1
 *> \verbatim
-*>          TAUP1 is COMPLEX*16 array, dimension (P)
+*>          TAUP1 is COMPLEX*16 array, dimension (M-Q)
 *>          The scalar factors of the elementary reflectors that define
 *>          P1.
 *> \endverbatim
 *>
 *> \param[out] TAUP2
 *> \verbatim
-*>          TAUP2 is COMPLEX*16 array, dimension (M-P)
+*>          TAUP2 is COMPLEX*16 array, dimension (M-Q)
 *>          The scalar factors of the elementary reflectors that define
 *>          P2.
*> \endverbatim From 6f09e4c1212db6f33407efd8bb4588335c626fee Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Nov 2022 19:37:28 +0100 Subject: [PATCH 126/154] Improve FMA usage in ?LAQR5 (Reference-LAPACK PR681) --- lapack-netlib/SRC/claqr5.f | 96 ++++++++++++++++++---------------- lapack-netlib/SRC/dlaqr5.f | 102 +++++++++++++++++++++---------------- lapack-netlib/SRC/slaqr5.f | 102 +++++++++++++++++++++---------------- lapack-netlib/SRC/zlaqr5.f | 97 +++++++++++++++++++---------------- 4 files changed, 223 insertions(+), 174 deletions(-) diff --git a/lapack-netlib/SRC/claqr5.f b/lapack-netlib/SRC/claqr5.f index 95cc33b9d..0a01cc226 100644 --- a/lapack-netlib/SRC/claqr5.f +++ b/lapack-netlib/SRC/claqr5.f @@ -279,7 +279,7 @@ PARAMETER ( RZERO = 0.0e0, RONE = 1.0e0 ) * .. * .. Local Scalars .. - COMPLEX ALPHA, BETA, CDUM, REFSUM + COMPLEX ALPHA, BETA, CDUM, REFSUM, T1, T2, T3 REAL H11, H12, H21, H22, SAFMAX, SAFMIN, SCL, $ SMLNUM, TST1, TST2, ULP INTEGER I2, I4, INCOL, J, JBOT, JCOL, JLEN, @@ -424,12 +424,12 @@ * ==== Perform update from right within * . computational window. ==== * + T1 = V( 1, M22 ) + T2 = T1*CONJG( V( 2, M22 ) ) DO 30 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M22 ) ) + REFSUM = H( J, K+1 ) + V( 2, M22 )*H( J, K+2 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 30 CONTINUE * * ==== Perform update from left within @@ -442,12 +442,13 @@ ELSE JBOT = KBOT END IF + T1 = CONJG( V( 1, M22 ) ) + T2 = T1*V( 2, M22 ) DO 40 J = K+1, JBOT - REFSUM = CONJG( V( 1, M22 ) )* - $ ( H( K+1, J )+CONJG( V( 2, M22 ) )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + REFSUM = H( K+1, J ) + + $ CONJG( V( 2, M22 ) )*H( K+2, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 40 CONTINUE * * ==== The following convergence test requires that @@ -610,25 +611,28 @@ * . deflation check. We still delay most of the * . updates from the left for efficiency. ==== * + T1 = V( 1, M ) + T2 = T1*CONJG( V( 2, M ) ) + T3 = T1*CONJG( V( 3, M ) ) DO 70 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - H( J, K+3 ) = H( J, K+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) + REFSUM = H( J, K+1 ) + V( 2, M )*H( J, K+2 ) + $ + V( 3, M )*H( J, K+3 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 + H( J, K+3 ) = H( J, K+3 ) - REFSUM*T3 70 CONTINUE * * ==== Perform update from left for subsequent * . column. ==== * - REFSUM = CONJG( V( 1, M ) )*( H( K+1, K+1 ) - $ +CONJG( V( 2, M ) )*H( K+2, K+1 ) - $ +CONJG( V( 3, M ) )*H( K+3, K+1 ) ) - H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM - H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) - H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) + T1 = CONJG( V( 1, M ) ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) + REFSUM = H( K+1, K+1 ) + CONJG( V( 2, M ) )*H( K+2, K+1 ) + $ + CONJG( V( 3, M ) )*H( K+3, K+1 ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM*T1 + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*T2 + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*T3 * * ==== The following convergence test requires that * . 
the tradition small-compared-to-nearby-diagonals @@ -688,13 +692,15 @@ * DO 100 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = CONJG( V( 1, M ) ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT - REFSUM = CONJG( V( 1, M ) )* - $ ( H( K+1, J )+CONJG( V( 2, M ) )* - $ H( K+2, J )+CONJG( V( 3, M ) )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, J ) + CONJG( V( 2, M ) )* + $ H( K+2, J ) + CONJG( V( 3, M ) )*H( K+3, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 + H( K+3, J ) = H( K+3, J ) - REFSUM*T3 90 CONTINUE 100 CONTINUE * @@ -712,14 +718,15 @@ I2 = MAX( 1, KTOP-INCOL ) I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + T1 = V( 1, M ) + T2 = T1*CONJG( V( 2, M ) ) + T3 = T1*CONJG( V( 3, M ) ) DO 110 J = I2, I4 - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - U( J, KMS+3 ) = U( J, KMS+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) + REFSUM = U( J, KMS+1 ) + V( 2, M )*U( J, KMS+2 ) + $ + V( 3, M )*U( J, KMS+3 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*T3 110 CONTINUE 120 CONTINUE ELSE IF( WANTZ ) THEN @@ -730,14 +737,15 @@ * DO 140 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*CONJG( V( 2, M ) ) + T3 = T1*CONJG( V( 3, M ) ) DO 130 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - Z( J, K+3 ) = Z( J, K+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) + REFSUM = Z( J, K+1 ) + V( 2, M )*Z( J, K+2 ) + $ + V( 3, M )*Z( J, K+3 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*T3 130 CONTINUE 140 CONTINUE END IF diff --git a/lapack-netlib/SRC/dlaqr5.f b/lapack-netlib/SRC/dlaqr5.f index 0c63ab800..43b4ac72a 100644 --- a/lapack-netlib/SRC/dlaqr5.f +++ b/lapack-netlib/SRC/dlaqr5.f @@ -286,8 +286,8 @@ * .. * .. Local Scalars .. DOUBLE PRECISION ALPHA, BETA, H11, H12, H21, H22, REFSUM, - $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, - $ ULP + $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, T1, T2, + $ T3, TST1, TST2, ULP INTEGER I, I2, I4, INCOL, J, JBOT, JCOL, JLEN, $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, $ M, M22, MBOT, MTOP, NBMPS, NDCOL, @@ -447,11 +447,12 @@ * ==== Perform update from right within * . computational window. 
==== * + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 30 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) + REFSUM = H( J, K+1 ) + V( 2, M22 )*H( J, K+2 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 30 CONTINUE * * ==== Perform update from left within @@ -464,11 +465,12 @@ ELSE JBOT = KBOT END IF + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 40 J = K+1, JBOT - REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + REFSUM = H( K+1, J ) + V( 2, M22 )*H( K+2, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 40 CONTINUE * * ==== The following convergence test requires that @@ -522,18 +524,20 @@ * IF( ACCUM ) THEN KMS = K - INCOL + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 50 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M22 ) + REFSUM = U( J, KMS+1 ) + V( 2, M22 )*U( J, KMS+2 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 50 CONTINUE ELSE IF( WANTZ ) THEN + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 60 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) + REFSUM = Z( J, K+1 )+V( 2, M22 )*Z( J, K+2 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 60 CONTINUE END IF END IF @@ -631,22 +635,25 @@ * . deflation check. We still delay most of the * . updates from the left for efficiency. ==== * + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 70 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) - H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) + REFSUM = H( J, K+1 ) + V( 2, M )*H( J, K+2 ) + $ + V( 3, M )*H( J, K+3 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 + H( J, K+3 ) = H( J, K+3 ) - REFSUM*T3 70 CONTINUE * * ==== Perform update from left for subsequent * . column. ==== * - REFSUM = V( 1, M )*( H( K+1, K+1 )+V( 2, M )* - $ H( K+2, K+1 )+V( 3, M )*H( K+3, K+1 ) ) - H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM - H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) - H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, K+1 ) + V( 2, M )*H( K+2, K+1 ) + $ + V( 3, M )*H( K+3, K+1 ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM*T1 + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*T2 + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*T3 * * ==== The following convergence test requires that * . 
the tradition small-compared-to-nearby-diagonals @@ -706,12 +713,15 @@ * DO 100 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT - REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* - $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, J ) + V( 2, M )*H( K+2, J ) + $ + V( 3, M )*H( K+3, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 + H( K+3, J ) = H( K+3, J ) - REFSUM*T3 90 CONTINUE 100 CONTINUE * @@ -729,12 +739,15 @@ I2 = MAX( 1, KTOP-INCOL ) I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 110 J = I2, I4 - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) - U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) + REFSUM = U( J, KMS+1 ) + V( 2, M )*U( J, KMS+2 ) + $ + V( 3, M )*U( J, KMS+3 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*T3 110 CONTINUE 120 CONTINUE ELSE IF( WANTZ ) THEN @@ -745,12 +758,15 @@ * DO 140 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 130 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) - Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) + REFSUM = Z( J, K+1 ) + V( 2, M )*Z( J, K+2 ) + $ + V( 3, M )*Z( J, K+3 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*T3 130 CONTINUE 140 CONTINUE END IF diff --git a/lapack-netlib/SRC/slaqr5.f b/lapack-netlib/SRC/slaqr5.f index b9bae9376..a4f805674 100644 --- a/lapack-netlib/SRC/slaqr5.f +++ b/lapack-netlib/SRC/slaqr5.f @@ -286,8 +286,8 @@ * .. * .. Local Scalars .. REAL ALPHA, BETA, H11, H12, H21, H22, REFSUM, - $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, - $ ULP + $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, T1, T2, + $ T3, TST1, TST2, ULP INTEGER I, I2, I4, INCOL, J, JBOT, JCOL, JLEN, $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, $ M, M22, MBOT, MTOP, NBMPS, NDCOL, @@ -447,11 +447,12 @@ * ==== Perform update from right within * . computational window. 
==== * + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 30 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) + REFSUM = H( J, K+1 ) + V( 2, M22 )*H( J, K+2 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 30 CONTINUE * * ==== Perform update from left within @@ -464,11 +465,12 @@ ELSE JBOT = KBOT END IF + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 40 J = K+1, JBOT - REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + REFSUM = H( K+1, J ) + V( 2, M22 )*H( K+2, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 40 CONTINUE * * ==== The following convergence test requires that @@ -522,18 +524,20 @@ * IF( ACCUM ) THEN KMS = K - INCOL + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 50 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M22 ) + REFSUM = U( J, KMS+1 ) + V( 2, M22 )*U( J, KMS+2 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 50 CONTINUE ELSE IF( WANTZ ) THEN + T1 = V( 1, M22 ) + T2 = T1*V( 2, M22 ) DO 60 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) + REFSUM = Z( J, K+1 )+V( 2, M22 )*Z( J, K+2 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 60 CONTINUE END IF END IF @@ -631,22 +635,25 @@ * . deflation check. We still delay most of the * . updates from the left for efficiency. ==== * + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 70 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) - H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) + REFSUM = H( J, K+1 ) + V( 2, M )*H( J, K+2 ) + $ + V( 3, M )*H( J, K+3 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 + H( J, K+3 ) = H( J, K+3 ) - REFSUM*T3 70 CONTINUE * * ==== Perform update from left for subsequent * . column. ==== * - REFSUM = V( 1, M )*( H( K+1, K+1 )+V( 2, M )* - $ H( K+2, K+1 )+V( 3, M )*H( K+3, K+1 ) ) - H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM - H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) - H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, K+1 ) + V( 2, M )*H( K+2, K+1 ) + $ + V( 3, M )*H( K+3, K+1 ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM*T1 + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*T2 + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*T3 * * ==== The following convergence test requires that * . 
the tradition small-compared-to-nearby-diagonals @@ -706,12 +713,15 @@ * DO 100 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT - REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* - $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, J ) + V( 2, M )*H( K+2, J ) + $ + V( 3, M )*H( K+3, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 + H( K+3, J ) = H( K+3, J ) - REFSUM*T3 90 CONTINUE 100 CONTINUE * @@ -729,12 +739,15 @@ I2 = MAX( 1, KTOP-INCOL ) I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 110 J = I2, I4 - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) - U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) + REFSUM = U( J, KMS+1 ) + V( 2, M )*U( J, KMS+2 ) + $ + V( 3, M )*U( J, KMS+3 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*T3 110 CONTINUE 120 CONTINUE ELSE IF( WANTZ ) THEN @@ -745,12 +758,15 @@ * DO 140 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 130 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) - Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) + REFSUM = Z( J, K+1 ) + V( 2, M )*Z( J, K+2 ) + $ + V( 3, M )*Z( J, K+3 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*T3 130 CONTINUE 140 CONTINUE END IF diff --git a/lapack-netlib/SRC/zlaqr5.f b/lapack-netlib/SRC/zlaqr5.f index 3185508bc..4fa5ee5b0 100644 --- a/lapack-netlib/SRC/zlaqr5.f +++ b/lapack-netlib/SRC/zlaqr5.f @@ -279,7 +279,7 @@ PARAMETER ( RZERO = 0.0d0, RONE = 1.0d0 ) * .. * .. Local Scalars .. - COMPLEX*16 ALPHA, BETA, CDUM, REFSUM + COMPLEX*16 ALPHA, BETA, CDUM, REFSUM, T1, T2, T3 DOUBLE PRECISION H11, H12, H21, H22, SAFMAX, SAFMIN, SCL, $ SMLNUM, TST1, TST2, ULP INTEGER I2, I4, INCOL, J, JBOT, JCOL, JLEN, @@ -424,12 +424,12 @@ * ==== Perform update from right within * . computational window. ==== * + T1 = V( 1, M22 ) + T2 = T1*DCONJG( V( 2, M22 ) ) DO 30 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M22 ) ) + REFSUM = H( J, K+1 ) + V( 2, M22 )*H( J, K+2 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 30 CONTINUE * * ==== Perform update from left within @@ -442,12 +442,13 @@ ELSE JBOT = KBOT END IF + T1 = DCONJG( V( 1, M22 ) ) + T2 = T1*V( 2, M22 ) DO 40 J = K+1, JBOT - REFSUM = DCONJG( V( 1, M22 ) )* - $ ( H( K+1, J )+DCONJG( V( 2, M22 ) )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + REFSUM = H( K+1, J ) + + $ DCONJG( V( 2, M22 ) )*H( K+2, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 40 CONTINUE * * ==== The following convergence test requires that @@ -610,25 +611,29 @@ * . deflation check. We still delay most of the * . 
updates from the left for efficiency. ==== * + T1 = V( 1, M ) + T2 = T1*DCONJG( V( 2, M ) ) + T3 = T1*DCONJG( V( 3, M ) ) DO 70 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - H( J, K+3 ) = H( J, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) + REFSUM = H( J, K+1 ) + V( 2, M )*H( J, K+2 ) + $ + V( 3, M )*H( J, K+3 ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM*T1 + H( J, K+2 ) = H( J, K+2 ) - REFSUM*T2 + H( J, K+3 ) = H( J, K+3 ) - REFSUM*T3 70 CONTINUE * * ==== Perform update from left for subsequent * . column. ==== * - REFSUM = DCONJG( V( 1, M ) )*( H( K+1, K+1 ) - $ +DCONJG( V( 2, M ) )*H( K+2, K+1 ) - $ +DCONJG( V( 3, M ) )*H( K+3, K+1 ) ) - H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM - H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) - H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) + T1 = DCONJG( V( 1, M ) ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) + REFSUM = H( K+1, K+1 ) + $ + DCONJG( V( 2, M ) )*H( K+2, K+1 ) + $ + DCONJG( V( 3, M ) )*H( K+3, K+1 ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM*T1 + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*T2 + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*T3 * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -688,13 +693,15 @@ * DO 100 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = DCONJG( V( 1, M ) ) + T2 = T1*V( 2, M ) + T3 = T1*V( 3, M ) DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT - REFSUM = DCONJG( V( 1, M ) )* - $ ( H( K+1, J )+DCONJG( V( 2, M ) )* - $ H( K+2, J )+DCONJG( V( 3, M ) )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + REFSUM = H( K+1, J ) + DCONJG( V( 2, M ) )*H( K+2, J ) + $ + DCONJG( V( 3, M ) )*H( K+3, J ) + H( K+1, J ) = H( K+1, J ) - REFSUM*T1 + H( K+2, J ) = H( K+2, J ) - REFSUM*T2 + H( K+3, J ) = H( K+3, J ) - REFSUM*T3 90 CONTINUE 100 CONTINUE * @@ -712,14 +719,15 @@ I2 = MAX( 1, KTOP-INCOL ) I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + T1 = V( 1, M ) + T2 = T1*DCONJG( V( 2, M ) ) + T3 = T1*DCONJG( V( 3, M ) ) DO 110 J = I2, I4 - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - U( J, KMS+3 ) = U( J, KMS+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) + REFSUM = U( J, KMS+1 ) + V( 2, M )*U( J, KMS+2 ) + $ + V( 3, M )*U( J, KMS+3 ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM*T1 + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*T2 + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*T3 110 CONTINUE 120 CONTINUE ELSE IF( WANTZ ) THEN @@ -730,14 +738,15 @@ * DO 140 M = MBOT, MTOP, -1 K = KRCOL + 2*( M-1 ) + T1 = V( 1, M ) + T2 = T1*DCONJG( V( 2, M ) ) + T3 = T1*DCONJG( V( 3, M ) ) DO 130 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - Z( J, K+3 ) = Z( J, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) + REFSUM = Z( J, K+1 ) + V( 2, M )*Z( J, K+2 ) + $ + V( 3, M )*Z( J, K+3 ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM*T1 + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*T2 + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*T3 130 CONTINUE 140 CONTINUE END IF From c6816bb5760827fb073fd65db49ee2178933e20d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Nov 2022 19:39:12 
+0100 Subject: [PATCH 127/154] Use normwise criterion in multishift QZ (Reference-LAPACK PR698) --- lapack-netlib/SRC/claqz0.f | 16 ++++++---------- lapack-netlib/SRC/dlaqz0.f | 16 ++++++---------- lapack-netlib/SRC/slaqz0.f | 17 +++++++---------- lapack-netlib/SRC/zlaqz0.f | 17 +++++++---------- 4 files changed, 26 insertions(+), 40 deletions(-) diff --git a/lapack-netlib/SRC/claqz0.f b/lapack-netlib/SRC/claqz0.f index 2284fd65d..9cc25c6dc 100644 --- a/lapack-netlib/SRC/claqz0.f +++ b/lapack-netlib/SRC/claqz0.f @@ -299,7 +299,7 @@ PARAMETER( ZERO = 0.0, ONE = 1.0, HALF = 0.5 ) * Local scalars - REAL :: SMLNUM, ULP, SAFMIN, SAFMAX, C1, TEMPR + REAL :: SMLNUM, ULP, SAFMIN, SAFMAX, C1, TEMPR, BNORM, BTOL COMPLEX :: ESHIFT, S1, TEMP INTEGER :: ISTART, ISTOP, IITER, MAXIT, ISTART2, K, LD, NSHIFTS, $ NBLOCK, NW, NMIN, NIBBLE, N_UNDEFLATED, N_DEFLATED, @@ -312,7 +312,7 @@ * External Functions EXTERNAL :: XERBLA, CHGEQZ, CLAQZ2, CLAQZ3, CLASET, SLABAD, $ CLARTG, CROT - REAL, EXTERNAL :: SLAMCH + REAL, EXTERNAL :: SLAMCH, CLANHS LOGICAL, EXTERNAL :: LSAME INTEGER, EXTERNAL :: ILAENV @@ -466,6 +466,9 @@ ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( REAL( N )/ULP ) + BNORM = CLANHS( 'F', IHI-ILO+1, B( ILO, ILO ), LDB, RWORK ) + BTOL = MAX( SAFMIN, ULP*BNORM ) + ISTART = ILO ISTOP = IHI MAXIT = 30*( IHI-ILO+1 ) @@ -528,15 +531,8 @@ * slow down the method when many infinite eigenvalues are present K = ISTOP DO WHILE ( K.GE.ISTART2 ) - TEMPR = ZERO - IF( K .LT. ISTOP ) THEN - TEMPR = TEMPR+ABS( B( K, K+1 ) ) - END IF - IF( K .GT. ISTART2 ) THEN - TEMPR = TEMPR+ABS( B( K-1, K ) ) - END IF - IF( ABS( B( K, K ) ) .LT. MAX( SMLNUM, ULP*TEMPR ) ) THEN + IF( ABS( B( K, K ) ) .LT. BTOL ) THEN * A diagonal element of B is negligable, move it * to the top and deflate it diff --git a/lapack-netlib/SRC/dlaqz0.f b/lapack-netlib/SRC/dlaqz0.f index 1bf65fd60..5b0965406 100644 --- a/lapack-netlib/SRC/dlaqz0.f +++ b/lapack-netlib/SRC/dlaqz0.f @@ -322,7 +322,7 @@ * Local scalars DOUBLE PRECISION :: SMLNUM, ULP, ESHIFT, SAFMIN, SAFMAX, C1, S1, - $ TEMP, SWAP + $ TEMP, SWAP, BNORM, BTOL INTEGER :: ISTART, ISTOP, IITER, MAXIT, ISTART2, K, LD, NSHIFTS, $ NBLOCK, NW, NMIN, NIBBLE, N_UNDEFLATED, N_DEFLATED, $ NS, SWEEP_INFO, SHIFTPOS, LWORKREQ, K2, ISTARTM, @@ -334,7 +334,7 @@ * External Functions EXTERNAL :: XERBLA, DHGEQZ, DLASET, DLAQZ3, DLAQZ4, DLABAD, $ DLARTG, DROT - DOUBLE PRECISION, EXTERNAL :: DLAMCH + DOUBLE PRECISION, EXTERNAL :: DLAMCH, DLANHS LOGICAL, EXTERNAL :: LSAME INTEGER, EXTERNAL :: ILAENV @@ -486,6 +486,9 @@ ULP = DLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( DBLE( N )/ULP ) + BNORM = DLANHS( 'F', IHI-ILO+1, B( ILO, ILO ), LDB, WORK ) + BTOL = MAX( SAFMIN, ULP*BNORM ) + ISTART = ILO ISTOP = IHI MAXIT = 3*( IHI-ILO+1 ) @@ -562,15 +565,8 @@ * slow down the method when many infinite eigenvalues are present K = ISTOP DO WHILE ( K.GE.ISTART2 ) - TEMP = ZERO - IF( K .LT. ISTOP ) THEN - TEMP = TEMP+ABS( B( K, K+1 ) ) - END IF - IF( K .GT. ISTART2 ) THEN - TEMP = TEMP+ABS( B( K-1, K ) ) - END IF - IF( ABS( B( K, K ) ) .LT. MAX( SMLNUM, ULP*TEMP ) ) THEN + IF( ABS( B( K, K ) ) .LT. 
BTOL ) THEN * A diagonal element of B is negligable, move it * to the top and deflate it diff --git a/lapack-netlib/SRC/slaqz0.f b/lapack-netlib/SRC/slaqz0.f index 15913be88..69f402914 100644 --- a/lapack-netlib/SRC/slaqz0.f +++ b/lapack-netlib/SRC/slaqz0.f @@ -318,7 +318,8 @@ PARAMETER( ZERO = 0.0, ONE = 1.0, HALF = 0.5 ) * Local scalars - REAL :: SMLNUM, ULP, ESHIFT, SAFMIN, SAFMAX, C1, S1, TEMP, SWAP + REAL :: SMLNUM, ULP, ESHIFT, SAFMIN, SAFMAX, C1, S1, TEMP, SWAP, + $ BNORM, BTOL INTEGER :: ISTART, ISTOP, IITER, MAXIT, ISTART2, K, LD, NSHIFTS, $ NBLOCK, NW, NMIN, NIBBLE, N_UNDEFLATED, N_DEFLATED, $ NS, SWEEP_INFO, SHIFTPOS, LWORKREQ, K2, ISTARTM, @@ -330,7 +331,7 @@ * External Functions EXTERNAL :: XERBLA, SHGEQZ, SLAQZ3, SLAQZ4, SLASET, SLABAD, $ SLARTG, SROT - REAL, EXTERNAL :: SLAMCH + REAL, EXTERNAL :: SLAMCH, SLANHS LOGICAL, EXTERNAL :: LSAME INTEGER, EXTERNAL :: ILAENV @@ -482,6 +483,9 @@ ULP = SLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( REAL( N )/ULP ) + BNORM = SLANHS( 'F', IHI-ILO+1, B( ILO, ILO ), LDB, WORK ) + BTOL = MAX( SAFMIN, ULP*BNORM ) + ISTART = ILO ISTOP = IHI MAXIT = 3*( IHI-ILO+1 ) @@ -558,15 +562,8 @@ * slow down the method when many infinite eigenvalues are present K = ISTOP DO WHILE ( K.GE.ISTART2 ) - TEMP = ZERO - IF( K .LT. ISTOP ) THEN - TEMP = TEMP+ABS( B( K, K+1 ) ) - END IF - IF( K .GT. ISTART2 ) THEN - TEMP = TEMP+ABS( B( K-1, K ) ) - END IF - IF( ABS( B( K, K ) ) .LT. MAX( SMLNUM, ULP*TEMP ) ) THEN + IF( ABS( B( K, K ) ) .LT. BTOL ) THEN * A diagonal element of B is negligable, move it * to the top and deflate it diff --git a/lapack-netlib/SRC/zlaqz0.f b/lapack-netlib/SRC/zlaqz0.f index 2616f20b5..0d8884ed5 100644 --- a/lapack-netlib/SRC/zlaqz0.f +++ b/lapack-netlib/SRC/zlaqz0.f @@ -300,7 +300,8 @@ PARAMETER( ZERO = 0.0D0, ONE = 1.0D0, HALF = 0.5D0 ) * Local scalars - DOUBLE PRECISION :: SMLNUM, ULP, SAFMIN, SAFMAX, C1, TEMPR + DOUBLE PRECISION :: SMLNUM, ULP, SAFMIN, SAFMAX, C1, TEMPR, + $ BNORM, BTOL COMPLEX*16 :: ESHIFT, S1, TEMP INTEGER :: ISTART, ISTOP, IITER, MAXIT, ISTART2, K, LD, NSHIFTS, $ NBLOCK, NW, NMIN, NIBBLE, N_UNDEFLATED, N_DEFLATED, @@ -313,7 +314,7 @@ * External Functions EXTERNAL :: XERBLA, ZHGEQZ, ZLAQZ2, ZLAQZ3, ZLASET, DLABAD, $ ZLARTG, ZROT - DOUBLE PRECISION, EXTERNAL :: DLAMCH + DOUBLE PRECISION, EXTERNAL :: DLAMCH, ZLANHS LOGICAL, EXTERNAL :: LSAME INTEGER, EXTERNAL :: ILAENV @@ -467,6 +468,9 @@ ULP = DLAMCH( 'PRECISION' ) SMLNUM = SAFMIN*( DBLE( N )/ULP ) + BNORM = ZLANHS( 'F', IHI-ILO+1, B( ILO, ILO ), LDB, RWORK ) + BTOL = MAX( SAFMIN, ULP*BNORM ) + ISTART = ILO ISTOP = IHI MAXIT = 30*( IHI-ILO+1 ) @@ -529,15 +533,8 @@ * slow down the method when many infinite eigenvalues are present K = ISTOP DO WHILE ( K.GE.ISTART2 ) - TEMPR = ZERO - IF( K .LT. ISTOP ) THEN - TEMPR = TEMPR+ABS( B( K, K+1 ) ) - END IF - IF( K .GT. ISTART2 ) THEN - TEMPR = TEMPR+ABS( B( K-1, K ) ) - END IF - IF( ABS( B( K, K ) ) .LT. MAX( SMLNUM, ULP*TEMPR ) ) THEN + IF( ABS( B( K, K ) ) .LT. 
BTOL ) THEN * A diagonal element of B is negligable, move it * to the top and deflate it From 31d2145988b0b952f702d3fdbeb910f6ff8e1489 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Nov 2022 22:44:36 +0100 Subject: [PATCH 128/154] Set scale early for robust triangular solvers (Reference-LAPACK PR712) --- lapack-netlib/SRC/clatbs.f | 9 ++--- lapack-netlib/SRC/clatrs.f | 79 ++++++++++++++++++++++++++++++++++---- lapack-netlib/SRC/dlatbs.f | 2 +- lapack-netlib/SRC/dlatrs.f | 69 ++++++++++++++++++++++++++++++--- lapack-netlib/SRC/slatbs.f | 2 +- lapack-netlib/SRC/slatrs.f | 69 ++++++++++++++++++++++++++++++--- lapack-netlib/SRC/zlatbs.f | 9 ++--- lapack-netlib/SRC/zlatrs.f | 79 ++++++++++++++++++++++++++++++++++---- 8 files changed, 278 insertions(+), 40 deletions(-) diff --git a/lapack-netlib/SRC/clatbs.f b/lapack-netlib/SRC/clatbs.f index 606f963d3..97abcadce 100644 --- a/lapack-netlib/SRC/clatbs.f +++ b/lapack-netlib/SRC/clatbs.f @@ -278,7 +278,7 @@ $ CDOTU, CLADIV * .. * .. External Subroutines .. - EXTERNAL CAXPY, CSSCAL, CTBSV, SLABAD, SSCAL, XERBLA + EXTERNAL CAXPY, CSSCAL, CTBSV, SSCAL, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CMPLX, CONJG, MAX, MIN, REAL @@ -324,17 +324,14 @@ * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * * Determine machine dependent parameters to control overflow. * - SMLNUM = SLAMCH( 'Safe minimum' ) - BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) - SMLNUM = SMLNUM / SLAMCH( 'Precision' ) + SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * diff --git a/lapack-netlib/SRC/clatrs.f b/lapack-netlib/SRC/clatrs.f index 946ab8068..91334b706 100644 --- a/lapack-netlib/SRC/clatrs.f +++ b/lapack-netlib/SRC/clatrs.f @@ -274,7 +274,7 @@ $ CDOTU, CLADIV * .. * .. External Subroutines .. - EXTERNAL CAXPY, CSSCAL, CTRSV, SLABAD, SSCAL, XERBLA + EXTERNAL CAXPY, CSSCAL, CTRSV, SSCAL, XERBLA * .. * .. Intrinsic Functions .. INTRINSIC ABS, AIMAG, CMPLX, CONJG, MAX, MIN, REAL @@ -318,17 +318,14 @@ * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * * Determine machine dependent parameters to control overflow. * - SMLNUM = SLAMCH( 'Safe minimum' ) - BIGNUM = ONE / SMLNUM - CALL SLABAD( SMLNUM, BIGNUM ) - SMLNUM = SMLNUM / SLAMCH( 'Precision' ) + SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * @@ -360,8 +357,74 @@ IF( TMAX.LE.BIGNUM*HALF ) THEN TSCAL = ONE ELSE - TSCAL = HALF / ( SMLNUM*TMAX ) - CALL SSCAL( N, TSCAL, CNORM, 1 ) +* +* Avoid NaN generation if entries in CNORM exceed the +* overflow threshold +* + IF ( TMAX.LE.SLAMCH('Overflow') ) THEN +* Case 1: All entries in CNORM are valid floating-point numbers + TSCAL = HALF / ( SMLNUM*TMAX ) + CALL SSCAL( N, TSCAL, CNORM, 1 ) + ELSE +* Case 2: At least one column norm of A cannot be +* represented as a floating-point number. Find the +* maximum offdiagonal absolute value +* max( |Re(A(I,J))|, |Im(A(I,J)| ). If this entry is +* not +/- Infinity, use this value as TSCAL. + TMAX = ZERO + IF( UPPER ) THEN +* +* A is upper triangular. +* + DO J = 2, N + DO I = 1, J - 1 + TMAX = MAX( TMAX, ABS( REAL( A( I, J ) ) ), + $ ABS( AIMAG(A ( I, J ) ) ) ) + END DO + END DO + ELSE +* +* A is lower triangular. 
+* + DO J = 1, N - 1 + DO I = J + 1, N + TMAX = MAX( TMAX, ABS( REAL( A( I, J ) ) ), + $ ABS( AIMAG(A ( I, J ) ) ) ) + END DO + END DO + END IF +* + IF( TMAX.LE.SLAMCH('Overflow') ) THEN + TSCAL = ONE / ( SMLNUM*TMAX ) + DO J = 1, N + IF( CNORM( J ).LE.SLAMCH('Overflow') ) THEN + CNORM( J ) = CNORM( J )*TSCAL + ELSE +* Recompute the 1-norm of each column without +* introducing Infinity in the summation. + TSCAL = TWO * TSCAL + CNORM( J ) = ZERO + IF( UPPER ) THEN + DO I = 1, J - 1 + CNORM( J ) = CNORM( J ) + + $ TSCAL * CABS2( A( I, J ) ) + END DO + ELSE + DO I = J + 1, N + CNORM( J ) = CNORM( J ) + + $ TSCAL * CABS2( A( I, J ) ) + END DO + END IF + TSCAL = TSCAL * HALF + END IF + END DO + ELSE +* At least one entry of A is not a valid floating-point +* entry. Rely on TRSV to propagate Inf and NaN. + CALL CTRSV( UPLO, TRANS, DIAG, N, A, LDA, X, 1 ) + RETURN + END IF + END IF END IF * * Compute a bound on the computed solution vector to see if the diff --git a/lapack-netlib/SRC/dlatbs.f b/lapack-netlib/SRC/dlatbs.f index 4b71d5399..6a812743b 100644 --- a/lapack-netlib/SRC/dlatbs.f +++ b/lapack-netlib/SRC/dlatbs.f @@ -310,6 +310,7 @@ * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * @@ -317,7 +318,6 @@ * SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * diff --git a/lapack-netlib/SRC/dlatrs.f b/lapack-netlib/SRC/dlatrs.f index 43f92911d..be156bee2 100644 --- a/lapack-netlib/SRC/dlatrs.f +++ b/lapack-netlib/SRC/dlatrs.f @@ -264,8 +264,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER IDAMAX - DOUBLE PRECISION DASUM, DDOT, DLAMCH - EXTERNAL LSAME, IDAMAX, DASUM, DDOT, DLAMCH + DOUBLE PRECISION DASUM, DDOT, DLAMCH, DLANGE + EXTERNAL LSAME, IDAMAX, DASUM, DDOT, DLAMCH, DLANGE * .. * .. External Subroutines .. EXTERNAL DAXPY, DSCAL, DTRSV, XERBLA @@ -304,6 +304,7 @@ * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * @@ -311,7 +312,6 @@ * SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * @@ -343,8 +343,67 @@ IF( TMAX.LE.BIGNUM ) THEN TSCAL = ONE ELSE - TSCAL = ONE / ( SMLNUM*TMAX ) - CALL DSCAL( N, TSCAL, CNORM, 1 ) +* +* Avoid NaN generation if entries in CNORM exceed the +* overflow threshold +* + IF( TMAX.LE.DLAMCH('Overflow') ) THEN +* Case 1: All entries in CNORM are valid floating-point numbers + TSCAL = ONE / ( SMLNUM*TMAX ) + CALL DSCAL( N, TSCAL, CNORM, 1 ) + ELSE +* Case 2: At least one column norm of A cannot be represented +* as floating-point number. Find the offdiagonal entry A( I, J ) +* with the largest absolute value. If this entry is not +/- Infinity, +* use this value as TSCAL. + TMAX = ZERO + IF( UPPER ) THEN +* +* A is upper triangular. +* + DO J = 2, N + TMAX = MAX( DLANGE( 'M', J-1, 1, A( 1, J ), 1, SUMJ ), + $ TMAX ) + END DO + ELSE +* +* A is lower triangular. 
+* + DO J = 1, N - 1 + TMAX = MAX( DLANGE( 'M', N-J, 1, A( J+1, J ), 1, + $ SUMJ ), TMAX ) + END DO + END IF +* + IF( TMAX.LE.DLAMCH('Overflow') ) THEN + TSCAL = ONE / ( SMLNUM*TMAX ) + DO J = 1, N + IF( CNORM( J ).LE.DLAMCH('Overflow') ) THEN + CNORM( J ) = CNORM( J )*TSCAL + ELSE +* Recompute the 1-norm without introducing Infinity +* in the summation + CNORM( J ) = ZERO + IF( UPPER ) THEN + DO I = 1, J - 1 + CNORM( J ) = CNORM( J ) + + $ TSCAL * ABS( A( I, J ) ) + END DO + ELSE + DO I = J + 1, N + CNORM( J ) = CNORM( J ) + + $ TSCAL * ABS( A( I, J ) ) + END DO + END IF + END IF + END DO + ELSE +* At least one entry of A is not a valid floating-point entry. +* Rely on TRSV to propagate Inf and NaN. + CALL DTRSV( UPLO, TRANS, DIAG, N, A, LDA, X, 1 ) + RETURN + END IF + END IF END IF * * Compute a bound on the computed solution vector to see if the diff --git a/lapack-netlib/SRC/slatbs.f b/lapack-netlib/SRC/slatbs.f index 617d0b2f5..77940f8cd 100644 --- a/lapack-netlib/SRC/slatbs.f +++ b/lapack-netlib/SRC/slatbs.f @@ -310,6 +310,7 @@ * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * @@ -317,7 +318,6 @@ * SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * diff --git a/lapack-netlib/SRC/slatrs.f b/lapack-netlib/SRC/slatrs.f index 94e0e88bc..0761d656f 100644 --- a/lapack-netlib/SRC/slatrs.f +++ b/lapack-netlib/SRC/slatrs.f @@ -264,8 +264,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ISAMAX - REAL SASUM, SDOT, SLAMCH - EXTERNAL LSAME, ISAMAX, SASUM, SDOT, SLAMCH + REAL SASUM, SDOT, SLAMCH, SLANGE + EXTERNAL LSAME, ISAMAX, SASUM, SDOT, SLAMCH, SLANGE * .. * .. External Subroutines .. EXTERNAL SAXPY, SSCAL, STRSV, XERBLA @@ -304,6 +304,7 @@ * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * @@ -311,7 +312,6 @@ * SMLNUM = SLAMCH( 'Safe minimum' ) / SLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * @@ -343,8 +343,67 @@ IF( TMAX.LE.BIGNUM ) THEN TSCAL = ONE ELSE - TSCAL = ONE / ( SMLNUM*TMAX ) - CALL SSCAL( N, TSCAL, CNORM, 1 ) +* +* Avoid NaN generation if entries in CNORM exceed the +* overflow threshold +* + IF ( TMAX.LE.SLAMCH('Overflow') ) THEN +* Case 1: All entries in CNORM are valid floating-point numbers + TSCAL = ONE / ( SMLNUM*TMAX ) + CALL SSCAL( N, TSCAL, CNORM, 1 ) + ELSE +* Case 2: At least one column norm of A cannot be represented +* as floating-point number. Find the offdiagonal entry A( I, J ) +* with the largest absolute value. If this entry is not +/- Infinity, +* use this value as TSCAL. + TMAX = ZERO + IF( UPPER ) THEN +* +* A is upper triangular. +* + DO J = 2, N + TMAX = MAX( SLANGE( 'M', J-1, 1, A( 1, J ), 1, SUMJ ), + $ TMAX ) + END DO + ELSE +* +* A is lower triangular. +* + DO J = 1, N - 1 + TMAX = MAX( SLANGE( 'M', N-J, 1, A( J+1, J ), 1, + $ SUMJ ), TMAX ) + END DO + END IF +* + IF( TMAX.LE.SLAMCH('Overflow') ) THEN + TSCAL = ONE / ( SMLNUM*TMAX ) + DO J = 1, N + IF( CNORM( J ).LE.SLAMCH('Overflow') ) THEN + CNORM( J ) = CNORM( J )*TSCAL + ELSE +* Recompute the 1-norm without introducing Infinity +* in the summation + CNORM( J ) = ZERO + IF( UPPER ) THEN + DO I = 1, J - 1 + CNORM( J ) = CNORM( J ) + + $ TSCAL * ABS( A( I, J ) ) + END DO + ELSE + DO I = J + 1, N + CNORM( J ) = CNORM( J ) + + $ TSCAL * ABS( A( I, J ) ) + END DO + END IF + END IF + END DO + ELSE +* At least one entry of A is not a valid floating-point entry. +* Rely on TRSV to propagate Inf and NaN. 
+ CALL STRSV( UPLO, TRANS, DIAG, N, A, LDA, X, 1 ) + RETURN + END IF + END IF END IF * * Compute a bound on the computed solution vector to see if the diff --git a/lapack-netlib/SRC/zlatbs.f b/lapack-netlib/SRC/zlatbs.f index b7b2cb8ae..bdffa1ea9 100644 --- a/lapack-netlib/SRC/zlatbs.f +++ b/lapack-netlib/SRC/zlatbs.f @@ -278,7 +278,7 @@ $ ZDOTU, ZLADIV * .. * .. External Subroutines .. - EXTERNAL DSCAL, XERBLA, ZAXPY, ZDSCAL, ZTBSV, DLABAD + EXTERNAL DSCAL, XERBLA, ZAXPY, ZDSCAL, ZTBSV * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, DCONJG, DIMAG, MAX, MIN @@ -324,17 +324,14 @@ * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * * Determine machine dependent parameters to control overflow. * - SMLNUM = DLAMCH( 'Safe minimum' ) - BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) - SMLNUM = SMLNUM / DLAMCH( 'Precision' ) + SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * diff --git a/lapack-netlib/SRC/zlatrs.f b/lapack-netlib/SRC/zlatrs.f index 91bb688ec..2276ace87 100644 --- a/lapack-netlib/SRC/zlatrs.f +++ b/lapack-netlib/SRC/zlatrs.f @@ -274,7 +274,7 @@ $ ZDOTU, ZLADIV * .. * .. External Subroutines .. - EXTERNAL DSCAL, XERBLA, ZAXPY, ZDSCAL, ZTRSV, DLABAD + EXTERNAL DSCAL, XERBLA, ZAXPY, ZDSCAL, ZTRSV * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, DCONJG, DIMAG, MAX, MIN @@ -318,17 +318,14 @@ * * Quick return if possible * + SCALE = ONE IF( N.EQ.0 ) $ RETURN * * Determine machine dependent parameters to control overflow. * - SMLNUM = DLAMCH( 'Safe minimum' ) - BIGNUM = ONE / SMLNUM - CALL DLABAD( SMLNUM, BIGNUM ) - SMLNUM = SMLNUM / DLAMCH( 'Precision' ) + SMLNUM = DLAMCH( 'Safe minimum' ) / DLAMCH( 'Precision' ) BIGNUM = ONE / SMLNUM - SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * @@ -360,8 +357,74 @@ IF( TMAX.LE.BIGNUM*HALF ) THEN TSCAL = ONE ELSE - TSCAL = HALF / ( SMLNUM*TMAX ) - CALL DSCAL( N, TSCAL, CNORM, 1 ) +* +* Avoid NaN generation if entries in CNORM exceed the +* overflow threshold +* + IF ( TMAX.LE.DLAMCH('Overflow') ) THEN +* Case 1: All entries in CNORM are valid floating-point numbers + TSCAL = HALF / ( SMLNUM*TMAX ) + CALL DSCAL( N, TSCAL, CNORM, 1 ) + ELSE +* Case 2: At least one column norm of A cannot be +* represented as a floating-point number. Find the +* maximum offdiagonal absolute value +* max( |Re(A(I,J))|, |Im(A(I,J)| ). If this entry is +* not +/- Infinity, use this value as TSCAL. + TMAX = ZERO + IF( UPPER ) THEN +* +* A is upper triangular. +* + DO J = 2, N + DO I = 1, J - 1 + TMAX = MAX( TMAX, ABS( DBLE( A( I, J ) ) ), + $ ABS( DIMAG(A ( I, J ) ) ) ) + END DO + END DO + ELSE +* +* A is lower triangular. +* + DO J = 1, N - 1 + DO I = J + 1, N + TMAX = MAX( TMAX, ABS( DBLE( A( I, J ) ) ), + $ ABS( DIMAG(A ( I, J ) ) ) ) + END DO + END DO + END IF +* + IF( TMAX.LE.DLAMCH('Overflow') ) THEN + TSCAL = ONE / ( SMLNUM*TMAX ) + DO J = 1, N + IF( CNORM( J ).LE.DLAMCH('Overflow') ) THEN + CNORM( J ) = CNORM( J )*TSCAL + ELSE +* Recompute the 1-norm of each column without +* introducing Infinity in the summation. + TSCAL = TWO * TSCAL + CNORM( J ) = ZERO + IF( UPPER ) THEN + DO I = 1, J - 1 + CNORM( J ) = CNORM( J ) + + $ TSCAL * CABS2( A( I, J ) ) + END DO + ELSE + DO I = J + 1, N + CNORM( J ) = CNORM( J ) + + $ TSCAL * CABS2( A( I, J ) ) + END DO + END IF + TSCAL = TSCAL * HALF + END IF + END DO + ELSE +* At least one entry of A is not a valid floating-point +* entry. Rely on TRSV to propagate Inf and NaN. 
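The Case 1 / Case 2 split introduced in these hunks exists because the old code could turn an overflowed column norm into NaN: with TMAX = Inf, TSCAL = 1/(SMLNUM*TMAX) (or HALF/(SMLNUM*TMAX) in the complex routines) evaluates to zero, and the subsequent CNORM(J)*TSCAL becomes 0*Inf = NaN. A minimal C sketch of that guard follows; it is an illustration only, not part of the patch, and the helper name, the fallback return value, and the use of DBL_MIN/DBL_EPSILON as stand-ins for SLAMCH/DLAMCH are assumptions of ours.

#include <math.h>
#include <float.h>

/* Rescale column norms so that later comparisons stay finite.
 * Returns the scale factor applied to cnorm, or 0.0 when at least one
 * norm is Inf and the caller must recompute the norms from the matrix
 * entries, as the patched routines do column by column. */
static double guard_cnorm(int n, double *cnorm)
{
    double smlnum = DBL_MIN / DBL_EPSILON;   /* safe minimum / precision */
    double bignum = 1.0 / smlnum;
    double tmax = 0.0;
    for (int j = 0; j < n; ++j)
        if (cnorm[j] > tmax) tmax = cnorm[j];

    if (tmax <= bignum)                      /* no rescaling needed */
        return 1.0;

    if (isfinite(tmax)) {                    /* Case 1: plain rescale is safe */
        double tscal = 1.0 / (smlnum * tmax);
        for (int j = 0; j < n; ++j) cnorm[j] *= tscal;
        return tscal;
    }

    /* Case 2: some cnorm[j] overflowed; 1/(smlnum*Inf) == 0 and
     * 0 * Inf == NaN, so a plain rescale is not safe here. */
    return 0.0;
}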
+ CALL ZTRSV( UPLO, TRANS, DIAG, N, A, LDA, X, 1 ) + RETURN + END IF + END IF END IF * * Compute a bound on the computed solution vector to see if the From e00f0fb26ac5ea52120db315b9c439515fd16572 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Nov 2022 22:46:58 +0100 Subject: [PATCH 129/154] Fix function documentation (Reference-LAPACK PR747) --- lapack-netlib/SRC/clarscl2.f | 10 +++++----- lapack-netlib/SRC/clascl2.f | 12 ++++++------ lapack-netlib/SRC/dlarscl2.f | 10 +++++----- lapack-netlib/SRC/dlascl2.f | 10 +++++----- lapack-netlib/SRC/slarscl2.f | 10 +++++----- lapack-netlib/SRC/slascl2.f | 10 +++++----- lapack-netlib/SRC/zlarscl2.f | 10 +++++----- lapack-netlib/SRC/zlascl2.f | 10 +++++----- 8 files changed, 41 insertions(+), 41 deletions(-) diff --git a/lapack-netlib/SRC/clarscl2.f b/lapack-netlib/SRC/clarscl2.f index 26b028dbb..f4e68523b 100644 --- a/lapack-netlib/SRC/clarscl2.f +++ b/lapack-netlib/SRC/clarscl2.f @@ -1,4 +1,4 @@ -*> \brief \b CLARSCL2 performs reciprocal diagonal scaling on a vector. +*> \brief \b CLARSCL2 performs reciprocal diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -34,7 +34,7 @@ *> *> \verbatim *> -*> CLARSCL2 performs a reciprocal diagonal scaling on an vector: +*> CLARSCL2 performs a reciprocal diagonal scaling on a matrix: *> x <-- inv(D) * x *> where the REAL diagonal matrix D is stored as a vector. *> @@ -66,14 +66,14 @@ *> \param[in,out] X *> \verbatim *> X is COMPLEX array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/clascl2.f b/lapack-netlib/SRC/clascl2.f index 2ae27975c..882273b5e 100644 --- a/lapack-netlib/SRC/clascl2.f +++ b/lapack-netlib/SRC/clascl2.f @@ -1,4 +1,4 @@ -*> \brief \b CLASCL2 performs diagonal scaling on a vector. +*> \brief \b CLASCL2 performs diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -34,9 +34,9 @@ *> *> \verbatim *> -*> CLASCL2 performs a diagonal scaling on a vector: +*> CLASCL2 performs a diagonal scaling on a matrix: *> x <-- D * x -*> where the diagonal REAL matrix D is stored as a vector. +*> where the diagonal REAL matrix D is stored as a matrix. *> *> Eventually to be replaced by BLAS_cge_diag_scale in the new BLAS *> standard. @@ -66,14 +66,14 @@ *> \param[in,out] X *> \verbatim *> X is COMPLEX array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/dlarscl2.f b/lapack-netlib/SRC/dlarscl2.f index 2468e2702..cc4b9aa3c 100644 --- a/lapack-netlib/SRC/dlarscl2.f +++ b/lapack-netlib/SRC/dlarscl2.f @@ -1,4 +1,4 @@ -*> \brief \b DLARSCL2 performs reciprocal diagonal scaling on a vector. +*> \brief \b DLARSCL2 performs reciprocal diagonal scaling on a matrix. 
* * =========== DOCUMENTATION =========== * @@ -33,7 +33,7 @@ *> *> \verbatim *> -*> DLARSCL2 performs a reciprocal diagonal scaling on an vector: +*> DLARSCL2 performs a reciprocal diagonal scaling on a matrix: *> x <-- inv(D) * x *> where the diagonal matrix D is stored as a vector. *> @@ -65,14 +65,14 @@ *> \param[in,out] X *> \verbatim *> X is DOUBLE PRECISION array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/dlascl2.f b/lapack-netlib/SRC/dlascl2.f index 901e43c49..568e296ad 100644 --- a/lapack-netlib/SRC/dlascl2.f +++ b/lapack-netlib/SRC/dlascl2.f @@ -1,4 +1,4 @@ -*> \brief \b DLASCL2 performs diagonal scaling on a vector. +*> \brief \b DLASCL2 performs diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -33,7 +33,7 @@ *> *> \verbatim *> -*> DLASCL2 performs a diagonal scaling on a vector: +*> DLASCL2 performs a diagonal scaling on a matrix: *> x <-- D * x *> where the diagonal matrix D is stored as a vector. *> @@ -65,14 +65,14 @@ *> \param[in,out] X *> \verbatim *> X is DOUBLE PRECISION array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/slarscl2.f b/lapack-netlib/SRC/slarscl2.f index 5726f12cd..c7b77c908 100644 --- a/lapack-netlib/SRC/slarscl2.f +++ b/lapack-netlib/SRC/slarscl2.f @@ -1,4 +1,4 @@ -*> \brief \b SLARSCL2 performs reciprocal diagonal scaling on a vector. +*> \brief \b SLARSCL2 performs reciprocal diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -33,7 +33,7 @@ *> *> \verbatim *> -*> SLARSCL2 performs a reciprocal diagonal scaling on an vector: +*> SLARSCL2 performs a reciprocal diagonal scaling on a matrix: *> x <-- inv(D) * x *> where the diagonal matrix D is stored as a vector. *> @@ -65,14 +65,14 @@ *> \param[in,out] X *> \verbatim *> X is REAL array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/slascl2.f b/lapack-netlib/SRC/slascl2.f index 07b506a8c..5efc1cfcd 100644 --- a/lapack-netlib/SRC/slascl2.f +++ b/lapack-netlib/SRC/slascl2.f @@ -1,4 +1,4 @@ -*> \brief \b SLASCL2 performs diagonal scaling on a vector. +*> \brief \b SLASCL2 performs diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -33,7 +33,7 @@ *> *> \verbatim *> -*> SLASCL2 performs a diagonal scaling on a vector: +*> SLASCL2 performs a diagonal scaling on a matrix: *> x <-- D * x *> where the diagonal matrix D is stored as a vector. 
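The corrected wording amounts to this: X is an M-by-N matrix, scaled row by row by a diagonal matrix D that is stored as a length-M vector. A short C sketch of the operation follows, using the column-major layout and LDX >= M convention; the function and variable names are ours, chosen only for illustration, not LAPACK's.

#include <stddef.h>

/* x <- D * x : multiply row i of the m-by-n matrix X by d[i] (the LASCL2
 * operation); dividing by d[i] instead gives the LARSCL2 operation. */
static void diag_scale(int m, int n, const double *d, double *x, int ldx)
{
    for (int j = 0; j < n; ++j)
        for (int i = 0; i < m; ++i)
            x[i + (size_t)j * ldx] *= d[i];
}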
*> @@ -65,14 +65,14 @@ *> \param[in,out] X *> \verbatim *> X is REAL array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/zlarscl2.f b/lapack-netlib/SRC/zlarscl2.f index 4a1e1603a..e61865906 100644 --- a/lapack-netlib/SRC/zlarscl2.f +++ b/lapack-netlib/SRC/zlarscl2.f @@ -1,4 +1,4 @@ -*> \brief \b ZLARSCL2 performs reciprocal diagonal scaling on a vector. +*> \brief \b ZLARSCL2 performs reciprocal diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -34,7 +34,7 @@ *> *> \verbatim *> -*> ZLARSCL2 performs a reciprocal diagonal scaling on an vector: +*> ZLARSCL2 performs a reciprocal diagonal scaling on a matrix: *> x <-- inv(D) * x *> where the DOUBLE PRECISION diagonal matrix D is stored as a vector. *> @@ -66,14 +66,14 @@ *> \param[in,out] X *> \verbatim *> X is COMPLEX*16 array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. *> \endverbatim * * Authors: diff --git a/lapack-netlib/SRC/zlascl2.f b/lapack-netlib/SRC/zlascl2.f index c4e6992fb..26406c363 100644 --- a/lapack-netlib/SRC/zlascl2.f +++ b/lapack-netlib/SRC/zlascl2.f @@ -1,4 +1,4 @@ -*> \brief \b ZLASCL2 performs diagonal scaling on a vector. +*> \brief \b ZLASCL2 performs diagonal scaling on a matrix. * * =========== DOCUMENTATION =========== * @@ -34,7 +34,7 @@ *> *> \verbatim *> -*> ZLASCL2 performs a diagonal scaling on a vector: +*> ZLASCL2 performs a diagonal scaling on a matrix: *> x <-- D * x *> where the DOUBLE PRECISION diagonal matrix D is stored as a vector. *> @@ -66,14 +66,14 @@ *> \param[in,out] X *> \verbatim *> X is COMPLEX*16 array, dimension (LDX,N) -*> On entry, the vector X to be scaled by D. -*> On exit, the scaled vector. +*> On entry, the matrix X to be scaled by D. +*> On exit, the scaled matrix. *> \endverbatim *> *> \param[in] LDX *> \verbatim *> LDX is INTEGER -*> The leading dimension of the vector X. LDX >= M. +*> The leading dimension of the matrix X. LDX >= M. 
*> \endverbatim * * Authors: From 7ae4269add1be1bff371c4e9d4d175ba7c630085 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Nov 2022 22:52:28 +0100 Subject: [PATCH 130/154] Use new algorithms for computing Givens rotations (Reference-LAPACK PR631) --- lapack-netlib/SRC/clartg.f90 | 159 +++++++++++++++++++++++----------- lapack-netlib/SRC/dlartg.f90 | 30 +++---- lapack-netlib/SRC/slartg.f90 | 30 +++---- lapack-netlib/SRC/zlartg.f90 | 161 ++++++++++++++++++++++++----------- 4 files changed, 247 insertions(+), 133 deletions(-) diff --git a/lapack-netlib/SRC/clartg.f90 b/lapack-netlib/SRC/clartg.f90 index 13a629a34..6231f8520 100644 --- a/lapack-netlib/SRC/clartg.f90 +++ b/lapack-netlib/SRC/clartg.f90 @@ -30,7 +30,7 @@ !> The mathematical formulas used for C and S are !> !> sgn(x) = { x / |x|, x != 0 -!> { 1, x = 0 +!> { 1, x = 0 !> !> R = sgn(F) * sqrt(|F|**2 + |G|**2) !> @@ -38,19 +38,20 @@ !> !> S = sgn(F) * conjg(G) / sqrt(|F|**2 + |G|**2) !> +!> Special conditions: +!> If G=0, then C=1 and S=0. +!> If F=0, then C=0 and S is chosen so that R is real. +!> !> When F and G are real, the formulas simplify to C = F/R and !> S = G/R, and the returned values of C, S, and R should be -!> identical to those returned by CLARTG. +!> identical to those returned by SLARTG. !> !> The algorithm used to compute these quantities incorporates scaling !> to avoid overflow or underflow in computing the square root of the !> sum of squares. !> -!> This is a faster version of the BLAS1 routine CROTG, except for -!> the following differences: -!> F and G are unchanged on return. -!> If G=0, then C=1 and S=0. -!> If F=0, then C=0 and S is chosen so that R is real. +!> This is the same routine CROTG fom BLAS1, except that +!> F and G are unchanged on return. !> !> Below, wp=>sp stands for single precision from LA_CONSTANTS module. !> \endverbatim @@ -91,22 +92,19 @@ ! Authors: ! ======== ! -!> \author Edward Anderson, Lockheed Martin +!> \author Weslley Pereira, University of Colorado Denver, USA ! -!> \date August 2016 +!> \date December 2021 ! !> \ingroup OTHERauxiliary ! -!> \par Contributors: -! ================== -!> -!> Weslley Pereira, University of Colorado Denver, USA -! !> \par Further Details: ! ===================== !> !> \verbatim !> +!> Based on the algorithm from +!> !> Anderson E. (2017) !> Algorithm 978: Safe Scaling in the Level 1 BLAS !> ACM Trans Math Softw 44:1--28 @@ -117,7 +115,7 @@ subroutine CLARTG( f, g, c, s, r ) use LA_CONSTANTS, & only: wp=>sp, zero=>szero, one=>sone, two=>stwo, czero, & - rtmin=>srtmin, rtmax=>srtmax, safmin=>ssafmin, safmax=>ssafmax + safmin=>ssafmin, safmax=>ssafmax ! ! -- LAPACK auxiliary routine -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -129,7 +127,7 @@ subroutine CLARTG( f, g, c, s, r ) complex(wp) f, g, r, s ! .. ! .. Local Scalars .. - real(wp) :: d, f1, f2, g1, g2, h2, p, u, uu, v, vv, w + real(wp) :: d, f1, f2, g1, g2, h2, u, v, w, rtmin, rtmax complex(wp) :: fs, gs, t ! .. ! .. Intrinsic Functions .. @@ -141,6 +139,9 @@ subroutine CLARTG( f, g, c, s, r ) ! .. Statement Function definitions .. ABSSQ( t ) = real( t )**2 + aimag( t )**2 ! .. +! .. Constants .. + rtmin = sqrt( safmin ) +! .. ! .. Executable Statements .. ! if( g == czero ) then @@ -149,30 +150,43 @@ subroutine CLARTG( f, g, c, s, r ) r = f else if( f == czero ) then c = zero - g1 = max( abs(real(g)), abs(aimag(g)) ) - if( g1 > rtmin .and. 
g1 < rtmax ) then + if( real(g) == zero ) then + r = abs(aimag(g)) + s = conjg( g ) / r + elseif( aimag(g) == zero ) then + r = abs(real(g)) + s = conjg( g ) / r + else + g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/2 ) + if( g1 > rtmin .and. g1 < rtmax ) then ! ! Use unscaled algorithm ! - g2 = ABSSQ( g ) - d = sqrt( g2 ) - s = conjg( g ) / d - r = d - else +! The following two lines can be replaced by `d = abs( g )`. +! This algorithm do not use the intrinsic complex abs. + g2 = ABSSQ( g ) + d = sqrt( g2 ) + s = conjg( g ) / d + r = d + else ! ! Use scaled algorithm ! - u = min( safmax, max( safmin, g1 ) ) - uu = one / u - gs = g*uu - g2 = ABSSQ( gs ) - d = sqrt( g2 ) - s = conjg( gs ) / d - r = d*u + u = min( safmax, max( safmin, g1 ) ) + gs = g / u +! The following two lines can be replaced by `d = abs( gs )`. +! This algorithm do not use the intrinsic complex abs. + g2 = ABSSQ( gs ) + d = sqrt( g2 ) + s = conjg( gs ) / d + r = d*u + end if end if else f1 = max( abs(real(f)), abs(aimag(f)) ) g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/4 ) if( f1 > rtmin .and. f1 < rtmax .and. & g1 > rtmin .and. g1 < rtmax ) then ! @@ -181,32 +195,51 @@ subroutine CLARTG( f, g, c, s, r ) f2 = ABSSQ( f ) g2 = ABSSQ( g ) h2 = f2 + g2 - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) + ! safmin <= f2 <= h2 <= safmax + if( f2 >= h2 * safmin ) then + ! safmin <= f2/h2 <= 1, and h2/f2 is finite + c = sqrt( f2 / h2 ) + r = f / c + rtmax = rtmax * 2 + if( f2 > rtmin .and. h2 < rtmax ) then + ! safmin <= sqrt( f2*h2 ) <= safmax + s = conjg( g ) * ( f / sqrt( f2*h2 ) ) + else + s = conjg( g ) * ( r / h2 ) + end if else - d = sqrt( f2 )*sqrt( h2 ) + ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow. + ! Moreover, + ! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax, + ! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax). + ! Also, + ! g2 >> f2, which means that h2 = g2. + d = sqrt( f2 * h2 ) + c = f2 / d + if( c >= safmin ) then + r = f / c + else + ! f2 / sqrt(f2 * h2) < safmin, then + ! sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + r = f * ( h2 / d ) + end if + s = conjg( g ) * ( f / d ) end if - p = 1 / d - c = f2*p - s = conjg( g )*( f*p ) - r = f*( h2*p ) else ! ! Use scaled algorithm ! u = min( safmax, max( safmin, f1, g1 ) ) - uu = one / u - gs = g*uu + gs = g / u g2 = ABSSQ( gs ) - if( f1*uu < rtmin ) then + if( f1 / u < rtmin ) then ! ! f is not well-scaled when scaled by g1. ! Use a different scaling for f. ! v = min( safmax, max( safmin, f1 ) ) - vv = one / v - w = v * uu - fs = f*vv + w = v / u + fs = f / v f2 = ABSSQ( fs ) h2 = f2*w**2 + g2 else @@ -214,19 +247,43 @@ subroutine CLARTG( f, g, c, s, r ) ! Otherwise use the same scaling for f and g. ! w = one - fs = f*uu + fs = f / u f2 = ABSSQ( fs ) h2 = f2 + g2 end if - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) + ! safmin <= f2 <= h2 <= safmax + if( f2 >= h2 * safmin ) then + ! safmin <= f2/h2 <= 1, and h2/f2 is finite + c = sqrt( f2 / h2 ) + r = fs / c + rtmax = rtmax * 2 + if( f2 > rtmin .and. h2 < rtmax ) then + ! safmin <= sqrt( f2*h2 ) <= safmax + s = conjg( gs ) * ( fs / sqrt( f2*h2 ) ) + else + s = conjg( gs ) * ( r / h2 ) + end if else - d = sqrt( f2 )*sqrt( h2 ) + ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow. + ! Moreover, + ! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax, + ! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax). + ! Also, + ! g2 >> f2, which means that h2 = g2. 
+ d = sqrt( f2 * h2 ) + c = f2 / d + if( c >= safmin ) then + r = fs / c + else + ! f2 / sqrt(f2 * h2) < safmin, then + ! sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + r = fs * ( h2 / d ) + end if + s = conjg( gs ) * ( fs / d ) end if - p = 1 / d - c = ( f2*p )*w - s = conjg( gs )*( fs*p ) - r = ( fs*( h2*p ) )*u + ! Rescale c and r + c = c * w + r = r * u end if end if return diff --git a/lapack-netlib/SRC/dlartg.f90 b/lapack-netlib/SRC/dlartg.f90 index ef8c6e386..b7049c32f 100644 --- a/lapack-netlib/SRC/dlartg.f90 +++ b/lapack-netlib/SRC/dlartg.f90 @@ -11,7 +11,7 @@ ! SUBROUTINE DLARTG( F, G, C, S, R ) ! ! .. Scalar Arguments .. -! REAL(wp) C, F, G, R, S +! REAL(wp) C, F, G, R, S ! .. ! !> \par Purpose: @@ -45,8 +45,6 @@ !> floating point operations (saves work in DBDSQR when !> there are zeros on the diagonal). !> -!> If F exceeds G in magnitude, C will be positive. -!> !> Below, wp=>dp stands for double precision from LA_CONSTANTS module. !> \endverbatim ! @@ -112,7 +110,7 @@ subroutine DLARTG( f, g, c, s, r ) use LA_CONSTANTS, & only: wp=>dp, zero=>dzero, half=>dhalf, one=>done, & - rtmin=>drtmin, rtmax=>drtmax, safmin=>dsafmin, safmax=>dsafmax + safmin=>dsafmin, safmax=>dsafmax ! ! -- LAPACK auxiliary routine -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -123,11 +121,15 @@ subroutine DLARTG( f, g, c, s, r ) real(wp) :: c, f, g, r, s ! .. ! .. Local Scalars .. - real(wp) :: d, f1, fs, g1, gs, p, u, uu + real(wp) :: d, f1, fs, g1, gs, u, rtmin, rtmax ! .. ! .. Intrinsic Functions .. intrinsic :: abs, sign, sqrt ! .. +! .. Constants .. + rtmin = sqrt( safmin ) + rtmax = sqrt( safmax/2 ) +! .. ! .. Executable Statements .. ! f1 = abs( f ) @@ -143,20 +145,18 @@ subroutine DLARTG( f, g, c, s, r ) else if( f1 > rtmin .and. f1 < rtmax .and. & g1 > rtmin .and. g1 < rtmax ) then d = sqrt( f*f + g*g ) - p = one / d - c = f1*p - s = g*sign( p, f ) + c = f1 / d r = sign( d, f ) + s = g / r else u = min( safmax, max( safmin, f1, g1 ) ) - uu = one / u - fs = f*uu - gs = g*uu + fs = f / u + gs = g / u d = sqrt( fs*fs + gs*gs ) - p = one / d - c = abs( fs )*p - s = gs*sign( p, f ) - r = sign( d, f )*u + c = abs( fs ) / d + r = sign( d, f ) + s = gs / r + r = r*u end if return end subroutine diff --git a/lapack-netlib/SRC/slartg.f90 b/lapack-netlib/SRC/slartg.f90 index a9af1aa8d..8a5a8f26a 100644 --- a/lapack-netlib/SRC/slartg.f90 +++ b/lapack-netlib/SRC/slartg.f90 @@ -35,7 +35,7 @@ !> square root of the sum of squares. !> !> This version is discontinuous in R at F = 0 but it returns the same -!> C and S as SLARTG for complex inputs (F,0) and (G,0). +!> C and S as CLARTG for complex inputs (F,0) and (G,0). !> !> This is a more accurate version of the BLAS1 routine SROTG, !> with the following other differences: @@ -45,8 +45,6 @@ !> floating point operations (saves work in SBDSQR when !> there are zeros on the diagonal). !> -!> If F exceeds G in magnitude, C will be positive. -!> !> Below, wp=>sp stands for single precision from LA_CONSTANTS module. !> \endverbatim ! @@ -112,7 +110,7 @@ subroutine SLARTG( f, g, c, s, r ) use LA_CONSTANTS, & only: wp=>sp, zero=>szero, half=>shalf, one=>sone, & - rtmin=>srtmin, rtmax=>srtmax, safmin=>ssafmin, safmax=>ssafmax + safmin=>ssafmin, safmax=>ssafmax ! ! -- LAPACK auxiliary routine -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -123,11 +121,15 @@ subroutine SLARTG( f, g, c, s, r ) real(wp) :: c, f, g, r, s ! .. ! .. Local Scalars .. 
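The real-precision routines DLARTG (above) and SLARTG (below) now share the same scale-then-rotate structure: rotate directly when |f| and |g| lie inside [rtmin, rtmax], otherwise divide both by u = min(safmax, max(safmin, |f|, |g|)), rotate the scaled pair, and multiply r back by u. The following C sketch mirrors that structure but is not part of the patch; the helper name and the use of DBL_MIN as a stand-in for the safe minimum are assumptions of ours.

#include <math.h>
#include <float.h>

/* Compute c, s, r with  [ c  s; -s  c ] * [ f; g ] = [ r; 0 ]. */
static void givens(double f, double g, double *c, double *s, double *r)
{
    double safmin = DBL_MIN, safmax = 1.0 / DBL_MIN;
    double rtmin = sqrt(safmin), rtmax = sqrt(safmax / 2.0);
    double f1 = fabs(f), g1 = fabs(g);

    if (g == 0.0) {
        *c = 1.0; *s = 0.0; *r = f;
    } else if (f == 0.0) {
        *c = 0.0; *s = copysign(1.0, g); *r = g1;
    } else if (f1 > rtmin && f1 < rtmax && g1 > rtmin && g1 < rtmax) {
        double d = sqrt(f * f + g * g);          /* cannot overflow here */
        *c = f1 / d;
        *r = copysign(d, f);
        *s = g / *r;
    } else {                                     /* scale into a safe range */
        double u = fmin(safmax, fmax(safmin, fmax(f1, g1)));
        double fs = f / u, gs = g / u;
        double d = sqrt(fs * fs + gs * gs);
        *c = fabs(fs) / d;
        *r = copysign(d, f);
        *s = gs / *r;
        *r *= u;                                 /* undo the scaling of r */
    }
}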
- real(wp) :: d, f1, fs, g1, gs, p, u, uu + real(wp) :: d, f1, fs, g1, gs, u, rtmin, rtmax ! .. ! .. Intrinsic Functions .. intrinsic :: abs, sign, sqrt ! .. +! .. Constants .. + rtmin = sqrt( safmin ) + rtmax = sqrt( safmax/2 ) +! .. ! .. Executable Statements .. ! f1 = abs( f ) @@ -143,20 +145,18 @@ subroutine SLARTG( f, g, c, s, r ) else if( f1 > rtmin .and. f1 < rtmax .and. & g1 > rtmin .and. g1 < rtmax ) then d = sqrt( f*f + g*g ) - p = one / d - c = f1*p - s = g*sign( p, f ) + c = f1 / d r = sign( d, f ) + s = g / r else u = min( safmax, max( safmin, f1, g1 ) ) - uu = one / u - fs = f*uu - gs = g*uu + fs = f / u + gs = g / u d = sqrt( fs*fs + gs*gs ) - p = one / d - c = abs( fs )*p - s = gs*sign( p, f ) - r = sign( d, f )*u + c = abs( fs ) / d + r = sign( d, f ) + s = gs / r + r = r*u end if return end subroutine diff --git a/lapack-netlib/SRC/zlartg.f90 b/lapack-netlib/SRC/zlartg.f90 index 337a4dda8..a4f9bd4b0 100644 --- a/lapack-netlib/SRC/zlartg.f90 +++ b/lapack-netlib/SRC/zlartg.f90 @@ -11,8 +11,8 @@ ! SUBROUTINE ZLARTG( F, G, C, S, R ) ! ! .. Scalar Arguments .. -! REAL(wp) C -! COMPLEX(wp) F, G, R, S +! REAL(wp) C +! COMPLEX(wp) F, G, R, S ! .. ! !> \par Purpose: @@ -30,7 +30,7 @@ !> The mathematical formulas used for C and S are !> !> sgn(x) = { x / |x|, x != 0 -!> { 1, x = 0 +!> { 1, x = 0 !> !> R = sgn(F) * sqrt(|F|**2 + |G|**2) !> @@ -38,6 +38,10 @@ !> !> S = sgn(F) * conjg(G) / sqrt(|F|**2 + |G|**2) !> +!> Special conditions: +!> If G=0, then C=1 and S=0. +!> If F=0, then C=0 and S is chosen so that R is real. +!> !> When F and G are real, the formulas simplify to C = F/R and !> S = G/R, and the returned values of C, S, and R should be !> identical to those returned by DLARTG. @@ -46,11 +50,8 @@ !> to avoid overflow or underflow in computing the square root of the !> sum of squares. !> -!> This is a faster version of the BLAS1 routine ZROTG, except for -!> the following differences: -!> F and G are unchanged on return. -!> If G=0, then C=1 and S=0. -!> If F=0, then C=0 and S is chosen so that R is real. +!> This is the same routine ZROTG fom BLAS1, except that +!> F and G are unchanged on return. !> !> Below, wp=>dp stands for double precision from LA_CONSTANTS module. !> \endverbatim @@ -91,22 +92,19 @@ ! Authors: ! ======== ! -!> \author Edward Anderson, Lockheed Martin +!> \author Weslley Pereira, University of Colorado Denver, USA ! -!> \date August 2016 +!> \date December 2021 ! !> \ingroup OTHERauxiliary ! -!> \par Contributors: -! ================== -!> -!> Weslley Pereira, University of Colorado Denver, USA -! !> \par Further Details: ! ===================== !> !> \verbatim !> +!> Based on the algorithm from +!> !> Anderson E. (2017) !> Algorithm 978: Safe Scaling in the Level 1 BLAS !> ACM Trans Math Softw 44:1--28 @@ -117,7 +115,7 @@ subroutine ZLARTG( f, g, c, s, r ) use LA_CONSTANTS, & only: wp=>dp, zero=>dzero, one=>done, two=>dtwo, czero=>zzero, & - rtmin=>drtmin, rtmax=>drtmax, safmin=>dsafmin, safmax=>dsafmax + safmin=>dsafmin, safmax=>dsafmax ! ! -- LAPACK auxiliary routine -- ! -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -129,7 +127,7 @@ subroutine ZLARTG( f, g, c, s, r ) complex(wp) f, g, r, s ! .. ! .. Local Scalars .. - real(wp) :: d, f1, f2, g1, g2, h2, p, u, uu, v, vv, w + real(wp) :: d, f1, f2, g1, g2, h2, u, v, w, rtmin, rtmax complex(wp) :: fs, gs, t ! .. ! .. Intrinsic Functions .. @@ -141,6 +139,9 @@ subroutine ZLARTG( f, g, c, s, r ) ! .. Statement Function definitions .. ABSSQ( t ) = real( t )**2 + aimag( t )**2 ! .. 
+! .. Constants .. + rtmin = sqrt( safmin ) +! .. ! .. Executable Statements .. ! if( g == czero ) then @@ -149,30 +150,43 @@ subroutine ZLARTG( f, g, c, s, r ) r = f else if( f == czero ) then c = zero - g1 = max( abs(real(g)), abs(aimag(g)) ) - if( g1 > rtmin .and. g1 < rtmax ) then + if( real(g) == zero ) then + r = abs(aimag(g)) + s = conjg( g ) / r + elseif( aimag(g) == zero ) then + r = abs(real(g)) + s = conjg( g ) / r + else + g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/2 ) + if( g1 > rtmin .and. g1 < rtmax ) then ! ! Use unscaled algorithm ! - g2 = ABSSQ( g ) - d = sqrt( g2 ) - s = conjg( g ) / d - r = d - else +! The following two lines can be replaced by `d = abs( g )`. +! This algorithm do not use the intrinsic complex abs. + g2 = ABSSQ( g ) + d = sqrt( g2 ) + s = conjg( g ) / d + r = d + else ! ! Use scaled algorithm ! - u = min( safmax, max( safmin, g1 ) ) - uu = one / u - gs = g*uu - g2 = ABSSQ( gs ) - d = sqrt( g2 ) - s = conjg( gs ) / d - r = d*u + u = min( safmax, max( safmin, g1 ) ) + gs = g / u +! The following two lines can be replaced by `d = abs( gs )`. +! This algorithm do not use the intrinsic complex abs. + g2 = ABSSQ( gs ) + d = sqrt( g2 ) + s = conjg( gs ) / d + r = d*u + end if end if else f1 = max( abs(real(f)), abs(aimag(f)) ) g1 = max( abs(real(g)), abs(aimag(g)) ) + rtmax = sqrt( safmax/4 ) if( f1 > rtmin .and. f1 < rtmax .and. & g1 > rtmin .and. g1 < rtmax ) then ! @@ -181,32 +195,51 @@ subroutine ZLARTG( f, g, c, s, r ) f2 = ABSSQ( f ) g2 = ABSSQ( g ) h2 = f2 + g2 - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) + ! safmin <= f2 <= h2 <= safmax + if( f2 >= h2 * safmin ) then + ! safmin <= f2/h2 <= 1, and h2/f2 is finite + c = sqrt( f2 / h2 ) + r = f / c + rtmax = rtmax * 2 + if( f2 > rtmin .and. h2 < rtmax ) then + ! safmin <= sqrt( f2*h2 ) <= safmax + s = conjg( g ) * ( f / sqrt( f2*h2 ) ) + else + s = conjg( g ) * ( r / h2 ) + end if else - d = sqrt( f2 )*sqrt( h2 ) + ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow. + ! Moreover, + ! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax, + ! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax). + ! Also, + ! g2 >> f2, which means that h2 = g2. + d = sqrt( f2 * h2 ) + c = f2 / d + if( c >= safmin ) then + r = f / c + else + ! f2 / sqrt(f2 * h2) < safmin, then + ! sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + r = f * ( h2 / d ) + end if + s = conjg( g ) * ( f / d ) end if - p = 1 / d - c = f2*p - s = conjg( g )*( f*p ) - r = f*( h2*p ) else ! ! Use scaled algorithm ! u = min( safmax, max( safmin, f1, g1 ) ) - uu = one / u - gs = g*uu + gs = g / u g2 = ABSSQ( gs ) - if( f1*uu < rtmin ) then + if( f1 / u < rtmin ) then ! ! f is not well-scaled when scaled by g1. ! Use a different scaling for f. ! v = min( safmax, max( safmin, f1 ) ) - vv = one / v - w = v * uu - fs = f*vv + w = v / u + fs = f / v f2 = ABSSQ( fs ) h2 = f2*w**2 + g2 else @@ -214,19 +247,43 @@ subroutine ZLARTG( f, g, c, s, r ) ! Otherwise use the same scaling for f and g. ! w = one - fs = f*uu + fs = f / u f2 = ABSSQ( fs ) h2 = f2 + g2 end if - if( f2 > rtmin .and. h2 < rtmax ) then - d = sqrt( f2*h2 ) + ! safmin <= f2 <= h2 <= safmax + if( f2 >= h2 * safmin ) then + ! safmin <= f2/h2 <= 1, and h2/f2 is finite + c = sqrt( f2 / h2 ) + r = fs / c + rtmax = rtmax * 2 + if( f2 > rtmin .and. h2 < rtmax ) then + ! 
safmin <= sqrt( f2*h2 ) <= safmax + s = conjg( gs ) * ( fs / sqrt( f2*h2 ) ) + else + s = conjg( gs ) * ( r / h2 ) + end if else - d = sqrt( f2 )*sqrt( h2 ) + ! f2/h2 <= safmin may be subnormal, and h2/f2 may overflow. + ! Moreover, + ! safmin <= f2*f2 * safmax < f2 * h2 < h2*h2 * safmin <= safmax, + ! sqrt(safmin) <= sqrt(f2 * h2) <= sqrt(safmax). + ! Also, + ! g2 >> f2, which means that h2 = g2. + d = sqrt( f2 * h2 ) + c = f2 / d + if( c >= safmin ) then + r = fs / c + else + ! f2 / sqrt(f2 * h2) < safmin, then + ! sqrt(safmin) <= f2 * sqrt(safmax) <= h2 / sqrt(f2 * h2) <= h2 * (safmin / f2) <= h2 <= safmax + r = fs * ( h2 / d ) + end if + s = conjg( gs ) * ( fs / d ) end if - p = 1 / d - c = ( f2*p )*w - s = conjg( gs )*( fs*p ) - r = ( fs*( h2*p ) )*u + ! Rescale c and r + c = c * w + r = r * u end if end if return From 50aba029107ef79a7c4a8836955cd743f1cf2e59 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 21 Nov 2022 18:00:31 +0100 Subject: [PATCH 131/154] Simplify ?SYSWAPR and fix its documentation (Reference-LAPACK 217) --- lapack-netlib/SRC/csyswapr.f | 43 ++++++++++----------------------- lapack-netlib/SRC/dsyswapr.f | 47 ++++++++++++------------------------ lapack-netlib/SRC/ssyswapr.f | 47 ++++++++++++------------------------ lapack-netlib/SRC/zsyswapr.f | 47 ++++++++++++------------------------ 4 files changed, 58 insertions(+), 126 deletions(-) diff --git a/lapack-netlib/SRC/csyswapr.f b/lapack-netlib/SRC/csyswapr.f index 185d81922..04004f3c1 100644 --- a/lapack-netlib/SRC/csyswapr.f +++ b/lapack-netlib/SRC/csyswapr.f @@ -58,15 +58,13 @@ *> \param[in,out] A *> \verbatim *> A is COMPLEX array, dimension (LDA,N) -*> On entry, the NB diagonal matrix D and the multipliers -*> used to obtain the factor U or L as computed by CSYTRF. -*> -*> On exit, if INFO = 0, the (symmetric) inverse of the original -*> matrix. If UPLO = 'U', the upper triangular part of the -*> inverse is formed and the part of A below the diagonal is not -*> referenced; if UPLO = 'L' the lower triangular part of the -*> inverse is formed and the part of A above the diagonal is -*> not referenced. +*> On entry, the N-by-N matrix A. On exit, the permuted matrix +*> where the rows I1 and I2 and columns I1 and I2 are interchanged. +*> If UPLO = 'U', the interchanges are applied to the upper +*> triangular part and the strictly lower triangular part of A is +*> not referenced; if UPLO = 'L', the interchanges are applied to +*> the lower triangular part and the part of A above the diagonal +*> is not referenced. *> \endverbatim *> *> \param[in] LDA @@ -116,7 +114,6 @@ * .. * .. Local Scalars .. LOGICAL UPPER - INTEGER I COMPLEX TMP * * .. External Functions .. 
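The simplification in the hunks below replaces elementwise DO loops with single xSWAP calls, using an increment of LDA to walk along a row of the column-major matrix and an increment of 1 to walk down a column, mirroring the real-precision call CALL DSWAP( I2-I1-1, A(I1,I1+1), LDA, A(I1+1,I2), 1 ) that appears further down. A small C sketch of that strided exchange follows; in real code one would simply call the BLAS routine, and the names and 0-based indexing here are our own.

#include <stddef.h>

/* Generic strided swap, the core of what xSWAP provides. */
static void swap_strided(int n, double *x, int incx, double *y, int incy)
{
    for (int k = 0; k < n; ++k) {
        double t = x[k * incx];
        x[k * incx] = y[k * incy];
        y[k * incy] = t;
    }
}

/* Upper-triangle middle section of the row/column exchange, 0-based i1 < i2:
 * A(i1, i1+1 .. i2-1)  <->  A(i1+1 .. i2-1, i2). */
static void syswapr_upper_middle(double *a, int lda, int i1, int i2)
{
    swap_strided(i2 - i1 - 1,
                 &a[i1 + (size_t)(i1 + 1) * lda], lda,
                 &a[(i1 + 1) + (size_t)i2 * lda], 1);
}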
@@ -143,19 +140,12 @@ A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1,I1+I) - A(I1,I1+I)=A(I1+I,I2) - A(I1+I,I2)=TMP - END DO + CALL CSWAP( I2-I1-1, A(I1,I1+1), LDA, A(I1+1,I2), 1 ) * * third swap * - swap row I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I1,I) - A(I1,I)=A(I2,I) - A(I2,I)=TMP - END DO + IF ( I2.LT.N ) + $ CALL CSWAP( N-I2, A(I1,I2+1), LDA, A(I2,I2+1), LDA ) * ELSE * @@ -171,19 +161,12 @@ A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1+I,I1) - A(I1+I,I1)=A(I2,I1+I) - A(I2,I1+I)=TMP - END DO + CALL CSWAP( I2-I1-1, A(I1+1,I1), 1, A(I2,I1+1), LDA ) * * third swap * - swap col I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I,I1) - A(I,I1)=A(I,I2) - A(I,I2)=TMP - END DO + IF ( I2.LT.N ) + $ CALL CSWAP( N-I2, A(I2+1,I1), 1, A(I2+1,I2), 1 ) * ENDIF END SUBROUTINE CSYSWAPR diff --git a/lapack-netlib/SRC/dsyswapr.f b/lapack-netlib/SRC/dsyswapr.f index c60ccbefc..93f6195f2 100644 --- a/lapack-netlib/SRC/dsyswapr.f +++ b/lapack-netlib/SRC/dsyswapr.f @@ -57,16 +57,14 @@ *> *> \param[in,out] A *> \verbatim -*> A is DOUBLE PRECISION array, dimension (LDA,N) -*> On entry, the NB diagonal matrix D and the multipliers -*> used to obtain the factor U or L as computed by DSYTRF. -*> -*> On exit, if INFO = 0, the (symmetric) inverse of the original -*> matrix. If UPLO = 'U', the upper triangular part of the -*> inverse is formed and the part of A below the diagonal is not -*> referenced; if UPLO = 'L' the lower triangular part of the -*> inverse is formed and the part of A above the diagonal is -*> not referenced. +*> A is DOUBLE PRECISION array, dimension (LDA,*) +*> On entry, the N-by-N matrix A. On exit, the permuted matrix +*> where the rows I1 and I2 and columns I1 and I2 are interchanged. +*> If UPLO = 'U', the interchanges are applied to the upper +*> triangular part and the strictly lower triangular part of A is +*> not referenced; if UPLO = 'L', the interchanges are applied to +*> the lower triangular part and the part of A above the diagonal +*> is not referenced. *> \endverbatim *> *> \param[in] LDA @@ -109,14 +107,13 @@ INTEGER I1, I2, LDA, N * .. * .. Array Arguments .. - DOUBLE PRECISION A( LDA, N ) + DOUBLE PRECISION A( LDA, * ) * * ===================================================================== * * .. * .. Local Scalars .. LOGICAL UPPER - INTEGER I DOUBLE PRECISION TMP * * .. External Functions .. 
@@ -143,19 +140,12 @@ A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1,I1+I) - A(I1,I1+I)=A(I1+I,I2) - A(I1+I,I2)=TMP - END DO + CALL DSWAP( I2-I1-1, A(I1,I1+1), LDA, A(I1+1,I2), 1 ) * * third swap * - swap row I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I1,I) - A(I1,I)=A(I2,I) - A(I2,I)=TMP - END DO + IF ( I2.LT.N ) + $ CALL DSWAP( N-I2, A(I1,I2+1), LDA, A(I2,I2+1), LDA ) * ELSE * @@ -171,19 +161,12 @@ A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1+I,I1) - A(I1+I,I1)=A(I2,I1+I) - A(I2,I1+I)=TMP - END DO + CALL DSWAP( I2-I1-1, A(I1+1,I1), 1, A(I2,I1+1), LDA ) * * third swap * - swap col I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I,I1) - A(I,I1)=A(I,I2) - A(I,I2)=TMP - END DO + IF ( I2.LT.N ) + $ CALL DSWAP( N-I2, A(I2+1,I1), 1, A(I2+1,I2), 1 ) * ENDIF END SUBROUTINE DSYSWAPR diff --git a/lapack-netlib/SRC/ssyswapr.f b/lapack-netlib/SRC/ssyswapr.f index 5e4265d7a..e1ab5a22a 100644 --- a/lapack-netlib/SRC/ssyswapr.f +++ b/lapack-netlib/SRC/ssyswapr.f @@ -57,16 +57,14 @@ *> *> \param[in,out] A *> \verbatim -*> A is REAL array, dimension (LDA,N) -*> On entry, the NB diagonal matrix D and the multipliers -*> used to obtain the factor U or L as computed by SSYTRF. -*> -*> On exit, if INFO = 0, the (symmetric) inverse of the original -*> matrix. If UPLO = 'U', the upper triangular part of the -*> inverse is formed and the part of A below the diagonal is not -*> referenced; if UPLO = 'L' the lower triangular part of the -*> inverse is formed and the part of A above the diagonal is -*> not referenced. +*> A is REAL array, dimension (LDA,*) +*> On entry, the N-by-N matrix A. On exit, the permuted matrix +*> where the rows I1 and I2 and columns I1 and I2 are interchanged. +*> If UPLO = 'U', the interchanges are applied to the upper +*> triangular part and the strictly lower triangular part of A is +*> not referenced; if UPLO = 'L', the interchanges are applied to +*> the lower triangular part and the part of A above the diagonal +*> is not referenced. *> \endverbatim *> *> \param[in] LDA @@ -109,14 +107,13 @@ INTEGER I1, I2, LDA, N * .. * .. Array Arguments .. - REAL A( LDA, N ) + REAL A( LDA, * ) * * ===================================================================== * * .. * .. Local Scalars .. LOGICAL UPPER - INTEGER I REAL TMP * * .. External Functions .. @@ -143,19 +140,12 @@ A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1,I1+I) - A(I1,I1+I)=A(I1+I,I2) - A(I1+I,I2)=TMP - END DO + CALL SSWAP( I2-I1-1, A(I1,I1+1), LDA, A(I1+1,I2), 1 ) * * third swap * - swap row I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I1,I) - A(I1,I)=A(I2,I) - A(I2,I)=TMP - END DO + IF ( I2.LT.N ) + $ CALL SSWAP( N-I2, A(I1,I2+1), LDA, A(I2,I2+1), LDA ) * ELSE * @@ -171,19 +161,12 @@ A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1+I,I1) - A(I1+I,I1)=A(I2,I1+I) - A(I2,I1+I)=TMP - END DO + CALL SSWAP( I2-I1-1, A(I1+1,I1), 1, A(I2,I1+1), LDA ) * * third swap * - swap col I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I,I1) - A(I,I1)=A(I,I2) - A(I,I2)=TMP - END DO + IF ( I2.LT.N ) + $ CALL SSWAP( N-I2, A(I2+1,I1), 1, A(I2+1,I2), 1 ) * ENDIF END SUBROUTINE SSYSWAPR diff --git a/lapack-netlib/SRC/zsyswapr.f b/lapack-netlib/SRC/zsyswapr.f index 1f1a87857..eb3c98c34 100644 --- a/lapack-netlib/SRC/zsyswapr.f +++ b/lapack-netlib/SRC/zsyswapr.f @@ -57,16 +57,14 @@ *> *> \param[in,out] A *> \verbatim -*> A is COMPLEX*16 array, dimension (LDA,N) -*> On entry, the NB diagonal matrix D and the multipliers -*> used to obtain the factor U or L as computed by ZSYTRF. 
-*> -*> On exit, if INFO = 0, the (symmetric) inverse of the original -*> matrix. If UPLO = 'U', the upper triangular part of the -*> inverse is formed and the part of A below the diagonal is not -*> referenced; if UPLO = 'L' the lower triangular part of the -*> inverse is formed and the part of A above the diagonal is -*> not referenced. +*> A is COMPLEX*16 array, dimension (LDA,*) +*> On entry, the N-by-N matrix A. On exit, the permuted matrix +*> where the rows I1 and I2 and columns I1 and I2 are interchanged. +*> If UPLO = 'U', the interchanges are applied to the upper +*> triangular part and the strictly lower triangular part of A is +*> not referenced; if UPLO = 'L', the interchanges are applied to +*> the lower triangular part and the part of A above the diagonal +*> is not referenced. *> \endverbatim *> *> \param[in] LDA @@ -109,14 +107,13 @@ INTEGER I1, I2, LDA, N * .. * .. Array Arguments .. - COMPLEX*16 A( LDA, N ) + COMPLEX*16 A( LDA, * ) * * ===================================================================== * * .. * .. Local Scalars .. LOGICAL UPPER - INTEGER I COMPLEX*16 TMP * * .. External Functions .. @@ -143,19 +140,12 @@ A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1,I1+I) - A(I1,I1+I)=A(I1+I,I2) - A(I1+I,I2)=TMP - END DO + CALL ZSWAP( I2-I1-1, A(I1,I1+1), LDA, A(I1+1,I2), 1 ) * * third swap * - swap row I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I1,I) - A(I1,I)=A(I2,I) - A(I2,I)=TMP - END DO + IF ( I2.LT.N ) + $ CALL ZSWAP( N-I2, A(I1,I2+1), LDA, A(I2,I2+1), LDA ) * ELSE * @@ -171,19 +161,12 @@ A(I1,I1)=A(I2,I2) A(I2,I2)=TMP * - DO I=1,I2-I1-1 - TMP=A(I1+I,I1) - A(I1+I,I1)=A(I2,I1+I) - A(I2,I1+I)=TMP - END DO + CALL ZSWAP( I2-I1-1, A(I1+1,I1), 1, A(I2,I1+1), LDA ) * * third swap * - swap col I1 and I2 from I2+1 to N - DO I=I2+1,N - TMP=A(I,I1) - A(I,I1)=A(I,I2) - A(I,I2)=TMP - END DO + IF ( I2.LT.N ) + $ CALL ZSWAP( N-I2, A(I2+1,I1), 1, A(I2+1,I2), 1 ) * ENDIF END SUBROUTINE ZSYSWAPR From c45edcb537564999cffd53e81555927fd6ff7d7b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 21 Nov 2022 19:59:33 +0100 Subject: [PATCH 132/154] Fix typo in comment (Reference-LAPACK PR735) --- lapack-netlib/SRC/ieeeck.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/SRC/ieeeck.f b/lapack-netlib/SRC/ieeeck.f index 74065c3b4..f9f6332ec 100644 --- a/lapack-netlib/SRC/ieeeck.f +++ b/lapack-netlib/SRC/ieeeck.f @@ -41,7 +41,7 @@ *> \param[in] ISPEC *> \verbatim *> ISPEC is INTEGER -*> Specifies whether to test just for inifinity arithmetic +*> Specifies whether to test just for infinity arithmetic *> or whether to test for infinity and NaN arithmetic. *> = 0: Verify infinity arithmetic only. *> = 1: Verify infinity and NaN arithmetic. From f8f2bebf118880774fca7c2c443b3e088276e207 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 21 Nov 2022 20:01:47 +0100 Subject: [PATCH 133/154] Fix function documentation for LAPACK ?TPRFB (Reference-LAPACK PR665) --- lapack-netlib/SRC/ctprfb.f | 2 +- lapack-netlib/SRC/dtprfb.f | 2 +- lapack-netlib/SRC/stprfb.f | 86 +++++++++++++++++++------------------- lapack-netlib/SRC/ztprfb.f | 2 +- 4 files changed, 46 insertions(+), 46 deletions(-) diff --git a/lapack-netlib/SRC/ctprfb.f b/lapack-netlib/SRC/ctprfb.f index 11496180f..6cd5f05bd 100644 --- a/lapack-netlib/SRC/ctprfb.f +++ b/lapack-netlib/SRC/ctprfb.f @@ -1,4 +1,4 @@ -*> \brief \b CTPRFB applies a real or complex "triangular-pentagonal" blocked reflector to a real or complex matrix, which is composed of two blocks. 
+*> \brief \b CTPRFB applies a complex "triangular-pentagonal" block reflector to a complex matrix, which is composed of two blocks. * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/dtprfb.f b/lapack-netlib/SRC/dtprfb.f index a3fc7d6c6..c015075b3 100644 --- a/lapack-netlib/SRC/dtprfb.f +++ b/lapack-netlib/SRC/dtprfb.f @@ -1,4 +1,4 @@ -*> \brief \b DTPRFB applies a real or complex "triangular-pentagonal" blocked reflector to a real or complex matrix, which is composed of two blocks. +*> \brief \b DTPRFB applies a real "triangular-pentagonal" block reflector to a real matrix, which is composed of two blocks. * * =========== DOCUMENTATION =========== * diff --git a/lapack-netlib/SRC/stprfb.f b/lapack-netlib/SRC/stprfb.f index 64e8b34f5..d91a80dfb 100644 --- a/lapack-netlib/SRC/stprfb.f +++ b/lapack-netlib/SRC/stprfb.f @@ -1,4 +1,4 @@ -*> \brief \b STPRFB applies a real or complex "triangular-pentagonal" blocked reflector to a real or complex matrix, which is composed of two blocks. +*> \brief \b STPRFB applies a real "triangular-pentagonal" block reflector to a real matrix, which is composed of two blocks. * * =========== DOCUMENTATION =========== * @@ -37,7 +37,7 @@ *> \verbatim *> *> STPRFB applies a real "triangular-pentagonal" block reflector H or its -*> conjugate transpose H^H to a real matrix C, which is composed of two +*> transpose H**T to a real matrix C, which is composed of two *> blocks A and B, either from the left or right. *> *> \endverbatim @@ -48,15 +48,15 @@ *> \param[in] SIDE *> \verbatim *> SIDE is CHARACTER*1 -*> = 'L': apply H or H^H from the Left -*> = 'R': apply H or H^H from the Right +*> = 'L': apply H or H**T from the Left +*> = 'R': apply H or H**T from the Right *> \endverbatim *> *> \param[in] TRANS *> \verbatim *> TRANS is CHARACTER*1 *> = 'N': apply H (No transpose) -*> = 'C': apply H^H (Conjugate transpose) +*> = 'T': apply H**T (Transpose) *> \endverbatim *> *> \param[in] DIRECT @@ -145,7 +145,7 @@ *> (LDA,N) if SIDE = 'L' or (LDA,K) if SIDE = 'R' *> On entry, the K-by-N or M-by-K matrix A. *> On exit, A is overwritten by the corresponding block of -*> H*C or H^H*C or C*H or C*H^H. See Further Details. +*> H*C or H**T*C or C*H or C*H**T. See Further Details. *> \endverbatim *> *> \param[in] LDA @@ -161,7 +161,7 @@ *> B is REAL array, dimension (LDB,N) *> On entry, the M-by-N matrix B. *> On exit, B is overwritten by the corresponding block of -*> H*C or H^H*C or C*H or C*H^H. See Further Details. +*> H*C or H**T*C or C*H or C*H**T. See Further Details. 
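The corrected documentation describes applying H = I - W T W**T (or its transpose) to a matrix split into the blocks A and B. The rank-one special case, a single Householder reflector H = I - tau*v*v**T applied from the left, already shows the shape of the update; the C sketch below covers only that special case, not the pentagonal blocked algorithm of ?TPRFB, and its names are ours.

#include <stddef.h>

/* C <- (I - tau*v*v^T) * C  for a column-major m-by-n matrix C. */
static void apply_householder_left(int m, int n, double tau,
                                   const double *v, double *c, int ldc)
{
    for (int j = 0; j < n; ++j) {
        double *cj = &c[(size_t)j * ldc];
        double w = 0.0;                          /* w = v^T * C(:,j) */
        for (int i = 0; i < m; ++i)
            w += v[i] * cj[i];
        for (int i = 0; i < m; ++i)              /* C(:,j) -= tau * w * v */
            cj[i] -= tau * w * v[i];
    }
}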
*> \endverbatim *> *> \param[in] LDB @@ -327,13 +327,13 @@ * Let W = [ I ] (K-by-K) * [ V ] (M-by-K) * -* Form H C or H^H C where C = [ A ] (K-by-N) -* [ B ] (M-by-N) +* Form H C or H**T C where C = [ A ] (K-by-N) +* [ B ] (M-by-N) * -* H = I - W T W^H or H^H = I - W T^H W^H +* H = I - W T W**T or H**T = I - W T**T W**T * -* A = A - T (A + V^H B) or A = A - T^H (A + V^H B) -* B = B - V T (A + V^H B) or B = B - V T^H (A + V^H B) +* A = A - T (A + V**T B) or A = A - T**T (A + V**T B) +* B = B - V T (A + V**T B) or B = B - V T**T (A + V**T B) * * --------------------------------------------------------------------------- * @@ -388,12 +388,12 @@ * Let W = [ I ] (K-by-K) * [ V ] (N-by-K) * -* Form C H or C H^H where C = [ A B ] (A is M-by-K, B is M-by-N) +* Form C H or C H**T where C = [ A B ] (A is M-by-K, B is M-by-N) * -* H = I - W T W^H or H^H = I - W T^H W^H +* H = I - W T W**T or H**T = I - W T**T W**T * -* A = A - (A + B V) T or A = A - (A + B V) T^H -* B = B - (A + B V) T V^H or B = B - (A + B V) T^H V^H +* A = A - (A + B V) T or A = A - (A + B V) T**T +* B = B - (A + B V) T V**T or B = B - (A + B V) T**T V**T * * --------------------------------------------------------------------------- * @@ -448,13 +448,13 @@ * Let W = [ V ] (M-by-K) * [ I ] (K-by-K) * -* Form H C or H^H C where C = [ B ] (M-by-N) -* [ A ] (K-by-N) +* Form H C or H**T C where C = [ B ] (M-by-N) +* [ A ] (K-by-N) * -* H = I - W T W^H or H^H = I - W T^H W^H +* H = I - W T W**T or H**T = I - W T**T W**T * -* A = A - T (A + V^H B) or A = A - T^H (A + V^H B) -* B = B - V T (A + V^H B) or B = B - V T^H (A + V^H B) +* A = A - T (A + V**T B) or A = A - T**T (A + V**T B) +* B = B - V T (A + V**T B) or B = B - V T**T (A + V**T B) * * --------------------------------------------------------------------------- * @@ -510,12 +510,12 @@ * Let W = [ V ] (N-by-K) * [ I ] (K-by-K) * -* Form C H or C H^H where C = [ B A ] (B is M-by-N, A is M-by-K) +* Form C H or C H**T where C = [ B A ] (B is M-by-N, A is M-by-K) * -* H = I - W T W^H or H^H = I - W T^H W^H +* H = I - W T W**T or H**T = I - W T**T W**T * -* A = A - (A + B V) T or A = A - (A + B V) T^H -* B = B - (A + B V) T V^H or B = B - (A + B V) T^H V^H +* A = A - (A + B V) T or A = A - (A + B V) T**T +* B = B - (A + B V) T V**T or B = B - (A + B V) T**T V**T * * --------------------------------------------------------------------------- * @@ -569,13 +569,13 @@ * * Let W = [ I V ] ( I is K-by-K, V is K-by-M ) * -* Form H C or H^H C where C = [ A ] (K-by-N) -* [ B ] (M-by-N) +* Form H C or H**T C where C = [ A ] (K-by-N) +* [ B ] (M-by-N) * -* H = I - W^H T W or H^H = I - W^H T^H W +* H = I - W**T T W or H**T = I - W**T T**T W * -* A = A - T (A + V B) or A = A - T^H (A + V B) -* B = B - V^H T (A + V B) or B = B - V^H T^H (A + V B) +* A = A - T (A + V B) or A = A - T**T (A + V B) +* B = B - V**T T (A + V B) or B = B - V**T T**T (A + V B) * * --------------------------------------------------------------------------- * @@ -629,12 +629,12 @@ * * Let W = [ I V ] ( I is K-by-K, V is K-by-N ) * -* Form C H or C H^H where C = [ A B ] (A is M-by-K, B is M-by-N) +* Form C H or C H**T where C = [ A B ] (A is M-by-K, B is M-by-N) * -* H = I - W^H T W or H^H = I - W^H T^H W +* H = I - W**T T W or H**T = I - W**T T**T W * -* A = A - (A + B V^H) T or A = A - (A + B V^H) T^H -* B = B - (A + B V^H) T V or B = B - (A + B V^H) T^H V +* A = A - (A + B V**T) T or A = A - (A + B V**T) T**T +* B = B - (A + B V**T) T V or B = B - (A + B V**T) T**T V * * 
--------------------------------------------------------------------------- * @@ -688,13 +688,13 @@ * * Let W = [ V I ] ( I is K-by-K, V is K-by-M ) * -* Form H C or H^H C where C = [ B ] (M-by-N) -* [ A ] (K-by-N) +* Form H C or H**T C where C = [ B ] (M-by-N) +* [ A ] (K-by-N) * -* H = I - W^H T W or H^H = I - W^H T^H W +* H = I - W**T T W or H**T = I - W**T T**T W * -* A = A - T (A + V B) or A = A - T^H (A + V B) -* B = B - V^H T (A + V B) or B = B - V^H T^H (A + V B) +* A = A - T (A + V B) or A = A - T**T (A + V B) +* B = B - V**T T (A + V B) or B = B - V**T T**T (A + V B) * * --------------------------------------------------------------------------- * @@ -748,12 +748,12 @@ * * Let W = [ V I ] ( I is K-by-K, V is K-by-N ) * -* Form C H or C H^H where C = [ B A ] (A is M-by-K, B is M-by-N) +* Form C H or C H**T where C = [ B A ] (A is M-by-K, B is M-by-N) * -* H = I - W^H T W or H^H = I - W^H T^H W +* H = I - W**T T W or H**T = I - W**T T**T W * -* A = A - (A + B V^H) T or A = A - (A + B V^H) T^H -* B = B - (A + B V^H) T V or B = B - (A + B V^H) T^H V +* A = A - (A + B V**T) T or A = A - (A + B V**T) T**T +* B = B - (A + B V**T) T V or B = B - (A + B V**T) T**T V * * --------------------------------------------------------------------------- * diff --git a/lapack-netlib/SRC/ztprfb.f b/lapack-netlib/SRC/ztprfb.f index 2edbd0566..7b1bc17a0 100644 --- a/lapack-netlib/SRC/ztprfb.f +++ b/lapack-netlib/SRC/ztprfb.f @@ -1,4 +1,4 @@ -*> \brief \b ZTPRFB applies a real or complex "triangular-pentagonal" blocked reflector to a real or complex matrix, which is composed of two blocks. +*> \brief \b ZTPRFB applies a complex "triangular-pentagonal" block reflector to a complex matrix, which is composed of two blocks. * * =========== DOCUMENTATION =========== * From afcd7e88b6610bcd8dd504f43ce6fe545048242d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 21 Nov 2022 21:18:39 +0100 Subject: [PATCH 134/154] Improve convergence of DLAED4/SLAED4 (Reference-LAPACK PR655) --- lapack-netlib/SRC/dlaed4.f | 7 +++++-- lapack-netlib/SRC/slaed4.f | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/SRC/dlaed4.f b/lapack-netlib/SRC/dlaed4.f index 3ee3ef920..b51e23d85 100644 --- a/lapack-netlib/SRC/dlaed4.f +++ b/lapack-netlib/SRC/dlaed4.f @@ -328,9 +328,12 @@ IF( C.LT.ZERO ) $ C = ABS( C ) IF( C.EQ.ZERO ) THEN -* ETA = B/A +* ETA = B/A * ETA = RHO - TAU - ETA = DLTUB - TAU +* ETA = DLTUB - TAU +* +* Update proposed by Li, Ren-Cang: + ETA = -W / ( DPSI+DPHI ) ELSE IF( A.GE.ZERO ) THEN ETA = ( A+SQRT( ABS( A*A-FOUR*B*C ) ) ) / ( TWO*C ) ELSE diff --git a/lapack-netlib/SRC/slaed4.f b/lapack-netlib/SRC/slaed4.f index f056746d8..339c5029c 100644 --- a/lapack-netlib/SRC/slaed4.f +++ b/lapack-netlib/SRC/slaed4.f @@ -328,9 +328,12 @@ IF( C.LT.ZERO ) $ C = ABS( C ) IF( C.EQ.ZERO ) THEN -* ETA = B/A +* ETA = B/A * ETA = RHO - TAU - ETA = DLTUB - TAU +* ETA = DLTUB - TAU +* +* Update proposed by Li, Ren-Cang: + ETA = -W / ( DPSI+DPHI ) ELSE IF( A.GE.ZERO ) THEN ETA = ( A+SQRT( ABS( A*A-FOUR*B*C ) ) ) / ( TWO*C ) ELSE From d3213575586c4fa3d3d5654247a184d2ddece7e3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 21 Nov 2022 21:19:44 +0100 Subject: [PATCH 135/154] Fix bug in DORCSD2BY1 (from Reference-LAPACK PR697) --- lapack-netlib/SRC/dorcsd2by1.f | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/SRC/dorcsd2by1.f b/lapack-netlib/SRC/dorcsd2by1.f index 06bf53db1..25fab0f33 100644 --- a/lapack-netlib/SRC/dorcsd2by1.f +++ 
b/lapack-netlib/SRC/dorcsd2by1.f @@ -580,7 +580,7 @@ * Simultaneously diagonalize X11 and X21. * CALL DBBCSD( JOBV1T, 'N', JOBU1, JOBU2, 'T', M, Q, P, THETA, - $ WORK(IPHI), V1T, LDV1T, DUM2, 1, U1, LDU1, U2, + $ WORK(IPHI), V1T, LDV1T, DUM1, 1, U1, LDU1, U2, $ LDU2, WORK(IB11D), WORK(IB11E), WORK(IB12D), $ WORK(IB12E), WORK(IB21D), WORK(IB21E), $ WORK(IB22D), WORK(IB22E), WORK(IBBCSD), LBBCSD, @@ -635,7 +635,7 @@ * Simultaneously diagonalize X11 and X21. * CALL DBBCSD( 'N', JOBV1T, JOBU2, JOBU1, 'T', M, M-Q, M-P, - $ THETA, WORK(IPHI), DUM2, 1, V1T, LDV1T, U2, + $ THETA, WORK(IPHI), DUM1, 1, V1T, LDV1T, U2, $ LDU2, U1, LDU1, WORK(IB11D), WORK(IB11E), $ WORK(IB12D), WORK(IB12E), WORK(IB21D), $ WORK(IB21E), WORK(IB22D), WORK(IB22E), @@ -706,7 +706,7 @@ * Simultaneously diagonalize X11 and X21. * CALL DBBCSD( JOBU2, JOBU1, 'N', JOBV1T, 'N', M, M-P, M-Q, - $ THETA, WORK(IPHI), U2, LDU2, U1, LDU1, DUM2, + $ THETA, WORK(IPHI), U2, LDU2, U1, LDU1, DUM1, $ 1, V1T, LDV1T, WORK(IB11D), WORK(IB11E), $ WORK(IB12D), WORK(IB12E), WORK(IB21D), $ WORK(IB21E), WORK(IB22D), WORK(IB22E), From 8408357bab29276844378ed360b5e64e06b5e8ec Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Nov 2022 14:01:48 +0100 Subject: [PATCH 136/154] Update LAPACK version number to 3.11.0 --- lapack-netlib/INSTALL/ilaver.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/INSTALL/ilaver.c b/lapack-netlib/INSTALL/ilaver.c index 83ef3e0d8..b274af292 100644 --- a/lapack-netlib/INSTALL/ilaver.c +++ b/lapack-netlib/INSTALL/ilaver.c @@ -573,7 +573,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ /* ===================================================================== */ *vers_major__ = 3; - *vers_minor__ = 9; + *vers_minor__ = 11; *vers_patch__ = 0; /* ===================================================================== */ From e6e2a63650bf6f789693fd5ec788d3f35371196c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Nov 2022 14:02:21 +0100 Subject: [PATCH 137/154] Update LAPACK version number to 3.11.0 --- lapack-netlib/INSTALL/ilaver.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/INSTALL/ilaver.f b/lapack-netlib/INSTALL/ilaver.f index 79fe597ae..a246c37cb 100644 --- a/lapack-netlib/INSTALL/ilaver.f +++ b/lapack-netlib/INSTALL/ilaver.f @@ -60,7 +60,7 @@ INTEGER VERS_MAJOR, VERS_MINOR, VERS_PATCH * ===================================================================== VERS_MAJOR = 3 - VERS_MINOR = 9 + VERS_MINOR = 11 VERS_PATCH = 0 * ===================================================================== * From 0b2f8dabbf9e6c4de8e4b62b3a8df96097a2c23f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Nov 2022 10:30:35 +0100 Subject: [PATCH 138/154] Fix array dimension (Reference-LAPACK 758) --- lapack-netlib/TESTING/EIG/csyl01.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/csyl01.f b/lapack-netlib/TESTING/EIG/csyl01.f index e21f1a7a0..82d790daa 100644 --- a/lapack-netlib/TESTING/EIG/csyl01.f +++ b/lapack-netlib/TESTING/EIG/csyl01.f @@ -124,7 +124,7 @@ $ C( MAXM, MAXN ), CC( MAXM, MAXN ), $ X( MAXM, MAXN ), $ DUML( MAXM ), DUMR( MAXN ), - $ D( MIN( MAXM, MAXN ) ) + $ D( MAX( MAXM, MAXN ) ) REAL SWORK( LDSWORK, 54 ), DUM( MAXN ), VM( 2 ) INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) * .. 
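The two ILAVER patches above bump the reported LAPACK version from 3.9.0 to 3.11.0. A quick way to confirm what a given build actually reports is to call ILAVER from C; the trailing-underscore name and plain int arguments below follow the most common Fortran calling convention but are platform- and build-dependent (ILP64 builds use 64-bit integers), so treat the declaration as an assumption rather than an OpenBLAS-provided prototype.

#include <stdio.h>

void ilaver_(int *vers_major, int *vers_minor, int *vers_patch);

int main(void)
{
    int major, minor, patch;
    ilaver_(&major, &minor, &patch);
    printf("LAPACK %d.%d.%d\n", major, minor, patch);   /* expect 3.11.0 */
    return 0;
}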
From bc3393f703dadb95b7d69ca61a74bff793841fc5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Nov 2022 10:31:18 +0100 Subject: [PATCH 139/154] Fix array dimension (Reference-LAPACK 758) --- lapack-netlib/TESTING/EIG/zsyl01.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/zsyl01.f b/lapack-netlib/TESTING/EIG/zsyl01.f index 1e8619a34..329f39dc4 100644 --- a/lapack-netlib/TESTING/EIG/zsyl01.f +++ b/lapack-netlib/TESTING/EIG/zsyl01.f @@ -124,7 +124,7 @@ $ C( MAXM, MAXN ), CC( MAXM, MAXN ), $ X( MAXM, MAXN ), $ DUML( MAXM ), DUMR( MAXN ), - $ D( MIN( MAXM, MAXN ) ) + $ D( MAX( MAXM, MAXN ) ) DOUBLE PRECISION SWORK( LDSWORK, 103 ), DUM( MAXN ), VM( 2 ) INTEGER ISEED( 4 ), IWORK( MAXM + MAXN + 2 ) * .. From 730ed549e6f0cfced4c4874da012baf1e464fdeb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Nov 2022 10:35:23 +0100 Subject: [PATCH 140/154] Fix typo in EXTERNAL (Reference-LAPACK PR760) --- lapack-netlib/TESTING/EIG/derred.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/derred.f b/lapack-netlib/TESTING/EIG/derred.f index 6df517825..11a932052 100644 --- a/lapack-netlib/TESTING/EIG/derred.f +++ b/lapack-netlib/TESTING/EIG/derred.f @@ -99,7 +99,7 @@ * .. * .. External Subroutines .. EXTERNAL CHKXER, DGEES, DGEESX, DGEEV, DGEEVX, DGEJSV, - $ DGESDD, DGESVD, DGESVDX, DGESVQ + $ DGESDD, DGESVD, DGESVDX, DGESVDQ * .. * .. External Functions .. LOGICAL DSLECT, LSAMEN From 825ae316e2195997349958479ce27da01e9fb77e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Nov 2022 10:36:10 +0100 Subject: [PATCH 141/154] Fix typo in EXTERNAL (Reference-LAPACK PR760) --- lapack-netlib/TESTING/EIG/zerred.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/zerred.f b/lapack-netlib/TESTING/EIG/zerred.f index d1219c02b..1876c1f1d 100644 --- a/lapack-netlib/TESTING/EIG/zerred.f +++ b/lapack-netlib/TESTING/EIG/zerred.f @@ -100,7 +100,7 @@ * .. * .. External Subroutines .. EXTERNAL CHKXER, ZGEES, ZGEESX, ZGEEV, ZGEEVX, ZGESVJ, - $ ZGESDD, ZGESVD, ZGESVDX, ZGESVQ + $ ZGESDD, ZGESVD, ZGESVDX, ZGESVDQ * .. * .. External Functions .. 
LOGICAL LSAMEN, ZSLECT From 7694ff495f26c29cc24771a29ccd13f4191a2baf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Nov 2022 10:40:59 +0100 Subject: [PATCH 142/154] Remove unnecessary return in void function call (Reference-LAPACK PR760) --- lapack-netlib/LAPACKE/utils/lapacke_ctz_trans.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ctz_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_ctz_trans.c index 8910aee7d..48d346611 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_ctz_trans.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_ctz_trans.c @@ -147,7 +147,7 @@ void LAPACKE_ctz_trans( int matrix_layout, char direct, char uplo, } /* Copy & transpose triangular part */ - return LAPACKE_ctr_trans( matrix_layout, uplo, diag, tri_n, - &in[tri_in_offset], ldin, - &out[tri_out_offset], ldout ); + LAPACKE_ctr_trans( matrix_layout, uplo, diag, tri_n, + &in[tri_in_offset], ldin, + &out[tri_out_offset], ldout ); } From d952cbf7bc23c1cb8fb7d2bdf4ddd888f4860905 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Nov 2022 10:41:50 +0100 Subject: [PATCH 143/154] Remove unnecessary return in void function call (Reference-LAPACK PR760) --- lapack-netlib/LAPACKE/utils/lapacke_dtz_trans.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/LAPACKE/utils/lapacke_dtz_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_dtz_trans.c index 80d94ead9..b39000d42 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_dtz_trans.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_dtz_trans.c @@ -147,7 +147,7 @@ void LAPACKE_dtz_trans( int matrix_layout, char direct, char uplo, } /* Copy & transpose triangular part */ - return LAPACKE_dtr_trans( matrix_layout, uplo, diag, tri_n, - &in[tri_in_offset], ldin, - &out[tri_out_offset], ldout ); + LAPACKE_dtr_trans( matrix_layout, uplo, diag, tri_n, + &in[tri_in_offset], ldin, + &out[tri_out_offset], ldout ); } From 74962c7f53110d915b4597344d2b397c150a9936 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Nov 2022 10:42:29 +0100 Subject: [PATCH 144/154] Remove unnecessary return in void function call (Reference-LAPACK PR760) --- lapack-netlib/LAPACKE/utils/lapacke_stz_trans.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/LAPACKE/utils/lapacke_stz_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_stz_trans.c index 793f3833d..cffee6c98 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_stz_trans.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_stz_trans.c @@ -147,7 +147,7 @@ void LAPACKE_stz_trans( int matrix_layout, char direct, char uplo, } /* Copy & transpose triangular part */ - return LAPACKE_str_trans( matrix_layout, uplo, diag, tri_n, - &in[tri_in_offset], ldin, - &out[tri_out_offset], ldout ); + LAPACKE_str_trans( matrix_layout, uplo, diag, tri_n, + &in[tri_in_offset], ldin, + &out[tri_out_offset], ldout ); } From c2ba4e6249e11661d9b8b5f0717a8256a1068143 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Nov 2022 10:43:34 +0100 Subject: [PATCH 145/154] Remove unnecessary return in void function call (Reference-LAPACK PR760) --- lapack-netlib/LAPACKE/utils/lapacke_ztz_trans.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ztz_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_ztz_trans.c index 881052331..faef6da50 100644 --- a/lapack-netlib/LAPACKE/utils/lapacke_ztz_trans.c +++ b/lapack-netlib/LAPACKE/utils/lapacke_ztz_trans.c @@ -147,7 +147,7 @@ void LAPACKE_ztz_trans( int 
matrix_layout, char direct, char uplo, } /* Copy & transpose triangular part */ - return LAPACKE_ztr_trans( matrix_layout, uplo, diag, tri_n, - &in[tri_in_offset], ldin, - &out[tri_out_offset], ldout ); + LAPACKE_ztr_trans( matrix_layout, uplo, diag, tri_n, + &in[tri_in_offset], ldin, + &out[tri_out_offset], ldout ); } From 19fd2d7f00325b846a1693eeb57ba2ad21ead8d5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Nov 2022 15:19:07 +0100 Subject: [PATCH 146/154] Use LSAME for character comparison (Reference-LAPACK PR755) --- lapack-netlib/SRC/iparam2stage.F | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lapack-netlib/SRC/iparam2stage.F b/lapack-netlib/SRC/iparam2stage.F index c153eef22..c701c2be0 100644 --- a/lapack-netlib/SRC/iparam2stage.F +++ b/lapack-netlib/SRC/iparam2stage.F @@ -178,7 +178,8 @@ * .. * .. External Functions .. INTEGER ILAENV - EXTERNAL ILAENV + LOGICAL LSAME + EXTERNAL ILAENV, LSAME * .. * .. Executable Statements .. * @@ -310,7 +311,7 @@ * * Will add the VECT OPTION HERE next release VECT = OPTS(1:1) - IF( VECT.EQ.'N' ) THEN + IF( LSAME( VECT, 'N' ) ) THEN LHOUS = MAX( 1, 4*NI ) ELSE * This is not correct, it need to call the ALGO and the stage2 From 0d26f1a4c7dc7e3829b3bde8f7d2e3a93f9fbbd3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Nov 2022 15:22:27 +0100 Subject: [PATCH 147/154] Fix wrong indexation in test (Reference-LAPACK PR755) --- lapack-netlib/TESTING/LIN/schktr.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/LIN/schktr.f b/lapack-netlib/TESTING/LIN/schktr.f index 5aeb1ce88..33f07726e 100644 --- a/lapack-netlib/TESTING/LIN/schktr.f +++ b/lapack-netlib/TESTING/LIN/schktr.f @@ -559,7 +559,7 @@ $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) * CALL STRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, - $ SCALE3 ( 1 ), RWORK, ONE, B( N+1 ), LDA, + $ SCALE3 ( 1 ), RWORK, ONE, B( 1 ), LDA, $ X, LDA, WORK, RESULT( 10 ) ) CALL SSCAL( N, BIGNUM, X, 1 ) CALL STRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, From b0393ea4e17d4910043224791999d972f031cfde Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Nov 2022 15:27:46 +0100 Subject: [PATCH 148/154] Fix test (Reference-LAPACK PR764) --- lapack-netlib/TESTING/LIN/schktr.f | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapack-netlib/TESTING/LIN/schktr.f b/lapack-netlib/TESTING/LIN/schktr.f index 33f07726e..92d876108 100644 --- a/lapack-netlib/TESTING/LIN/schktr.f +++ b/lapack-netlib/TESTING/LIN/schktr.f @@ -555,11 +555,11 @@ * IF( INFO.NE.0 ) $ CALL ALAERH( PATH, 'SLATRS3', INFO, 0, - $ UPLO // TRANS // DIAG // 'Y', N, N, + $ UPLO // TRANS // DIAG // 'N', N, N, $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) * CALL STRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, - $ SCALE3 ( 1 ), RWORK, ONE, B( 1 ), LDA, + $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, $ X, LDA, WORK, RESULT( 10 ) ) CALL SSCAL( N, BIGNUM, X, 1 ) CALL STRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, From a5470521ee4737060da10ee6bd97d229a19d49e9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Nov 2022 15:31:25 +0100 Subject: [PATCH 149/154] Fix array indexation in copy, and fix test (Reference-LAPACK PR764) --- lapack-netlib/TESTING/LIN/cchktr.f | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/TESTING/LIN/cchktr.f b/lapack-netlib/TESTING/LIN/cchktr.f index c55b07643..4b09361d8 100644 --- a/lapack-netlib/TESTING/LIN/cchktr.f +++ b/lapack-netlib/TESTING/LIN/cchktr.f @@ -541,7 +541,7 @@ * SRNAMT = 'CLATRS3' CALL CCOPY( N, X, 1, B, 1 ) - CALL 
CCOPY( N, X, 1, B, 1 ) + CALL CCOPY( N, X, 1, B( N+1 ), 1 ) CALL CSCAL( N, BIGNUM, B( N+1 ), 1 ) CALL CLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, @@ -551,7 +551,7 @@ * IF( INFO.NE.0 ) $ CALL ALAERH( PATH, 'CLATRS3', INFO, 0, - $ UPLO // TRANS // DIAG // 'Y', N, N, + $ UPLO // TRANS // DIAG // 'N', N, N, $ -1, -1, -1, IMAT, NFAIL, NERRS, NOUT ) CALL CTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, $ SCALE3( 1 ), RWORK, ONE, B( 1 ), LDA, @@ -559,7 +559,7 @@ CALL CSSCAL( N, BIGNUM, X, 1 ) CALL CTRT03( UPLO, TRANS, DIAG, N, 1, A, LDA, $ SCALE3( 2 ), RWORK, ONE, B( N+1 ), LDA, - $ X, LDA, WORK, RESULT( 10 ) ) + $ X, LDA, WORK, RES ) RESULT( 10 ) = MAX( RESULT( 10 ), RES ) * * Print information about the tests that did not pass From 4f7b77e08aa77f6e907e858f3160ecbffe27027e Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Fri, 25 Nov 2022 15:24:32 +0000 Subject: [PATCH 150/154] Remove unnecessary instructions from Advanced SIMD dot The existing kernel was issuing extra instructions to organise the arguments into the same registers they would usually be in and similarly to put the result into the appropriate register. This has an impact on smaller sized dots and seemed like a quick fix --- kernel/arm64/dot_thunderx2t99.c | 247 +++++++++++++++----------------- 1 file changed, 118 insertions(+), 129 deletions(-) diff --git a/kernel/arm64/dot_thunderx2t99.c b/kernel/arm64/dot_thunderx2t99.c index 3940acddd..9131f1e86 100644 --- a/kernel/arm64/dot_thunderx2t99.c +++ b/kernel/arm64/dot_thunderx2t99.c @@ -1,5 +1,6 @@ /*************************************************************************** Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2022, Arm Ltd All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -36,25 +37,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define RETURN_TYPE double #endif -#define N "x0" /* vector length */ -#define X "x1" /* "X" vector address */ -#define INC_X "x2" /* "X" stride */ -#define Y "x3" /* "Y" vector address */ -#define INC_Y "x4" /* "Y" stride */ -#define J "x5" /* loop variable */ - #if !defined(DOUBLE) #if !defined(DSDOT) +#define DOT_MOD "s" #define REG0 "wzr" -#define DOTF "s0" #define TMPX "s16" #define TMPY "s24" #define INC_SHIFT "2" #define N_DIV_SHIFT "6" #define N_REM_MASK "63" #else +#define DOT_MOD "d" #define REG0 "xzr" -#define DOTF "d0" #define TMPX "s16" #define TMPX1 "d2" #define TMPY "s24" @@ -64,8 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N_REM_MASK "15" #endif #else +#define DOT_MOD "d" #define REG0 "xzr" -#define DOTF "d0" #define TMPX "d16" #define TMPY "d24" #define INC_SHIFT "3" @@ -73,59 +67,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define N_REM_MASK "31" #endif +#define OUT "%"DOT_MOD"[DOT_]" + #if !defined(DOUBLE) #if !defined(DSDOT) #define KERNEL_F1 \ - " ldr "TMPX", ["X"] \n" \ - " ldr "TMPY", ["Y"] \n" \ - " add "X", "X", "INC_X" \n" \ - " add "Y", "Y", "INC_Y" \n" \ - " fmadd "DOTF", "TMPX", "TMPY", "DOTF" \n" + " ldr "TMPX", [%[X_]] \n" \ + " ldr "TMPY", [%[Y_]] \n" \ + " add %[X_], %[X_], %[INCX_] \n" \ + " add %[Y_], %[Y_], %[INCY_] \n" \ + " fmadd "OUT", "TMPX", "TMPY", "OUT" \n" #define KERNEL_F \ - " ldp q16, q17, ["X"] \n" \ - " ldp q24, q25, ["Y"] \n" \ - " ldp q18, q19, ["X", #32] \n" \ - " ldp q26, q27, ["Y", #32] \n" \ + " ldp q16, q17, [%[X_]] \n" \ + " ldp q24, q25, [%[Y_]] \n" \ + " ldp q18, q19, [%[X_], #32] \n" \ + " ldp q26, q27, [%[Y_], #32] \n" \ " fmla v0.4s, v16.4s, v24.4s \n" \ " fmla v1.4s, v17.4s, v25.4s \n" \ - " ldp q20, q21, ["X", #64] \n" \ - " ldp q28, q29, ["Y", #64] \n" \ + " ldp q20, q21, [%[X_], #64] \n" \ + " ldp q28, q29, [%[Y_], #64] \n" \ " fmla v2.4s, v18.4s, v26.4s \n" \ " fmla v3.4s, v19.4s, v27.4s \n" \ - " ldp q22, q23, ["X", #96] \n" \ - " ldp q30, q31, ["Y", #96] \n" \ - " add "Y", "Y", #128 \n" \ - " add "X", "X", #128 \n" \ + " ldp q22, q23, [%[X_], #96] \n" \ + " ldp q30, q31, [%[Y_], #96] \n" \ + " add %[Y_], %[Y_], #128 \n" \ + " add %[X_], %[X_], #128 \n" \ " fmla v4.4s, v20.4s, v28.4s \n" \ " fmla v5.4s, v21.4s, v29.4s \n" \ - " PRFM PLDL1KEEP, ["X", #896] \n" \ - " PRFM PLDL1KEEP, ["Y", #896] \n" \ - " PRFM PLDL1KEEP, ["X", #896+64] \n" \ - " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ " fmla v6.4s, v22.4s, v30.4s \n" \ " fmla v7.4s, v23.4s, v31.4s \n" \ - " ldp q16, q17, ["X"] \n" \ - " ldp q24, q25, ["Y"] \n" \ - " ldp q18, q19, ["X", #32] \n" \ - " ldp q26, q27, ["Y", #32] \n" \ + " ldp q16, q17, [%[X_]] \n" \ + " ldp q24, q25, [%[Y_]] \n" \ + " ldp q18, q19, [%[X_], #32] \n" \ + " ldp q26, q27, [%[Y_], #32] \n" \ " fmla v0.4s, v16.4s, v24.4s \n" \ " fmla v1.4s, v17.4s, v25.4s \n" \ - " ldp q20, q21, ["X", #64] \n" \ - " ldp q28, q29, ["Y", #64] \n" \ + " ldp q20, q21, [%[X_], #64] \n" \ + " ldp q28, q29, [%[Y_], #64] \n" \ " fmla v2.4s, v18.4s, v26.4s \n" \ " fmla v3.4s, v19.4s, v27.4s \n" \ - " ldp q22, q23, ["X", #96] \n" \ - " ldp q30, q31, ["Y", #96] \n" \ - " add "Y", "Y", #128 \n" \ - " add "X", "X", #128 \n" \ + " ldp q22, q23, [%[X_], #96] \n" \ + " ldp q30, q31, [%[Y_], #96] \n" \ + " add %[Y_], %[Y_], #128 \n" \ + " add %[X_], %[X_], #128 \n" \ " fmla v4.4s, v20.4s, v28.4s \n" \ " fmla v5.4s, v21.4s, v29.4s \n" \ - " PRFM PLDL1KEEP, ["X", #896] \n" \ - " PRFM PLDL1KEEP, ["Y", #896] \n" \ - " PRFM PLDL1KEEP, ["X", #896+64] \n" \ - " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ " fmla v6.4s, v22.4s, v30.4s \n" \ " fmla v7.4s, v23.4s, v31.4s \n" @@ -142,19 +138,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else /* !defined(DSDOT) */ #define KERNEL_F1 \ - " ldr "TMPX", ["X"] \n" \ - " ldr "TMPY", ["Y"] \n" \ - " add "X", "X", "INC_X" \n" \ - " add "Y", "Y", "INC_Y" \n" \ + " ldr "TMPX", [%[X_]] \n" \ + " ldr "TMPY", [%[Y_]] \n" \ + " add %[X_], %[X_], %[INCX_] \n" \ + " add %[Y_], %[Y_], %[INCY_] \n" \ " fcvt "TMPX1", "TMPX" \n" \ " fcvt "TMPY1", "TMPY" \n" \ " fmul "TMPX1", "TMPX1", "TMPY1" \n" \ - " fadd "DOTF", "DOTF", "TMPX1" \n" + " fadd "OUT", "OUT", "TMPX1" \n" #define KERNEL_F \ - " ldp q18, q19, ["X"] \n" \ - " ldp q26, q27, ["Y"] \n" \ + " ldp q18, q19, [%[X_]] \n" \ + " ldp q26, q27, [%[Y_]] \n" \ " fcvtl v16.2d, v18.2s \n" \ " fcvtl2 v17.2d, v18.4s \n" \ " fcvtl v18.2d, v19.2s \n" \ @@ -163,8 +159,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " fcvtl2 v25.2d, v26.4s \n" \ " fcvtl v26.2d, v27.2s \n" \ " fcvtl2 v27.2d, v27.4s \n" \ - " ldp q22, q23, ["X", #32] \n" \ - " ldp q30, q31, ["Y", #32] \n" \ + " ldp q22, q23, [%[X_], #32] \n" \ + " ldp q30, q31, [%[Y_], #32] \n" \ " fcvtl v20.2d, v22.2s \n" \ " fcvtl2 v21.2d, v22.4s \n" \ " fcvtl v22.2d, v23.2s \n" \ @@ -173,16 +169,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " fcvtl2 v29.2d, v30.4s \n" \ " fcvtl v30.2d, v31.2s \n" \ " fcvtl2 v31.2d, v31.4s \n" \ - " PRFM PLDL1KEEP, ["X", #896] \n" \ - " PRFM PLDL1KEEP, ["Y", #896] \n" \ - " PRFM PLDL1KEEP, ["X", #896+64] \n" \ - " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ " fmla v0.2d, v16.2d, v24.2d \n" \ " fmla v1.2d, v17.2d, v25.2d \n" \ " fmla v2.2d, v18.2d, v26.2d \n" \ " fmla v3.2d, v19.2d, v27.2d \n" \ - " add "Y", "Y", #64 \n" \ - " add "X", "X", #64 \n" \ + " add %[Y_], %[Y_], #64 \n" \ + " add %[X_], %[X_], #64 \n" \ " fmla v4.2d, v20.2d, v28.2d \n" \ " fmla v5.2d, v21.2d, v29.2d \n" \ " fmla v6.2d, v22.2d, v30.2d \n" \ @@ -196,60 +192,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
" fadd v0.2d, v0.2d, v2.2d \n" \ " fadd v4.2d, v4.2d, v6.2d \n" \ " fadd v0.2d, v0.2d, v4.2d \n" \ - " faddp "DOTF", v0.2d \n" + " faddp "OUT", v0.2d \n" #endif /* !defined(DSDOT) */ #else /* !defined(DOUBLE) */ #define KERNEL_F1 \ - " ldr "TMPX", ["X"] \n" \ - " ldr "TMPY", ["Y"] \n" \ - " add "X", "X", "INC_X" \n" \ - " add "Y", "Y", "INC_Y" \n" \ - " fmadd "DOTF", "TMPX", "TMPY", "DOTF" \n" + " ldr "TMPX", [%[X_]] \n" \ + " ldr "TMPY", [%[Y_]] \n" \ + " add %[X_], %[X_], %[INCX_] \n" \ + " add %[Y_], %[Y_], %[INCY_] \n" \ + " fmadd "OUT", "TMPX", "TMPY", "OUT" \n" #define KERNEL_F \ - " ldp q16, q17, ["X"] \n" \ - " ldp q24, q25, ["Y"] \n" \ - " ldp q18, q19, ["X", #32] \n" \ - " ldp q26, q27, ["Y", #32] \n" \ + " ldp q16, q17, [%[X_]] \n" \ + " ldp q24, q25, [%[Y_]] \n" \ + " ldp q18, q19, [%[X_], #32] \n" \ + " ldp q26, q27, [%[Y_], #32] \n" \ " fmla v0.2d, v16.2d, v24.2d \n" \ " fmla v1.2d, v17.2d, v25.2d \n" \ - " ldp q20, q21, ["X", #64] \n" \ - " ldp q28, q29, ["Y", #64] \n" \ + " ldp q20, q21, [%[X_], #64] \n" \ + " ldp q28, q29, [%[Y_], #64] \n" \ " fmla v2.2d, v18.2d, v26.2d \n" \ " fmla v3.2d, v19.2d, v27.2d \n" \ - " ldp q22, q23, ["X", #96] \n" \ - " ldp q30, q31, ["Y", #96] \n" \ - " add "Y", "Y", #128 \n" \ - " add "X", "X", #128 \n" \ + " ldp q22, q23, [%[X_], #96] \n" \ + " ldp q30, q31, [%[Y_], #96] \n" \ + " add %[Y_], %[Y_], #128 \n" \ + " add %[X_], %[X_], #128 \n" \ " fmla v4.2d, v20.2d, v28.2d \n" \ " fmla v5.2d, v21.2d, v29.2d \n" \ - " PRFM PLDL1KEEP, ["X", #896] \n" \ - " PRFM PLDL1KEEP, ["Y", #896] \n" \ - " PRFM PLDL1KEEP, ["X", #896+64] \n" \ - " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ " fmla v6.2d, v22.2d, v30.2d \n" \ " fmla v7.2d, v23.2d, v31.2d \n" \ - " ldp q16, q17, ["X"] \n" \ - " ldp q24, q25, ["Y"] \n" \ - " ldp q18, q19, ["X", #32] \n" \ - " ldp q26, q27, ["Y", #32] \n" \ + " ldp q16, q17, [%[X_]] \n" \ + " ldp q24, q25, [%[Y_]] \n" \ + " ldp q18, q19, [%[X_], #32] \n" \ + " ldp q26, q27, [%[Y_], #32] \n" \ " fmla v0.2d, v16.2d, v24.2d \n" \ " fmla v1.2d, v17.2d, v25.2d \n" \ - " ldp q20, q21, ["X", #64] \n" \ - " ldp q28, q29, ["Y", #64] \n" \ + " ldp q20, q21, [%[X_], #64] \n" \ + " ldp q28, q29, [%[Y_], #64] \n" \ " fmla v2.2d, v18.2d, v26.2d \n" \ " fmla v3.2d, v19.2d, v27.2d \n" \ - " ldp q22, q23, ["X", #96] \n" \ - " ldp q30, q31, ["Y", #96] \n" \ - " add "Y", "Y", #128 \n" \ - " add "X", "X", #128 \n" \ + " ldp q22, q23, [%[X_], #96] \n" \ + " ldp q30, q31, [%[Y_], #96] \n" \ + " add %[Y_], %[Y_], #128 \n" \ + " add %[X_], %[X_], #128 \n" \ " fmla v4.2d, v20.2d, v28.2d \n" \ " fmla v5.2d, v21.2d, v29.2d \n" \ - " PRFM PLDL1KEEP, ["X", #896] \n" \ - " PRFM PLDL1KEEP, ["Y", #896] \n" \ - " PRFM PLDL1KEEP, ["X", #896+64] \n" \ - " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ + " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ + " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ " fmla v6.2d, v22.2d, v30.2d \n" \ " fmla v7.2d, v23.2d, v31.2d \n" @@ -261,7 +257,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
" fadd v0.2d, v0.2d, v2.2d \n" \ " fadd v4.2d, v4.2d, v6.2d \n" \ " fadd v0.2d, v0.2d, v4.2d \n" \ - " faddp "DOTF", v0.2d \n" + " faddp "OUT", v0.2d \n" #endif /* !defined(DOUBLE) */ #if defined(SMP) @@ -272,17 +268,14 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - RETURN_TYPE dot = 0.0 ; + RETURN_TYPE dot = 0.0; if ( n < 0 ) return dot; + BLASLONG j = 0; + __asm__ __volatile__ ( - " mov "N", %[N_] \n" - " mov "X", %[X_] \n" - " mov "INC_X", %[INCX_] \n" - " mov "Y", %[Y_] \n" - " mov "INC_Y", %[INCY_] \n" - " fmov "DOTF", "REG0" \n" + " fmov "OUT", "REG0" \n" " fmov d1, xzr \n" " fmov d2, xzr \n" " fmov d3, xzr \n" @@ -290,42 +283,40 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B " fmov d5, xzr \n" " fmov d6, xzr \n" " fmov d7, xzr \n" - " cmp "N", xzr \n" - " ble 9f //dot_kernel_L999 \n" - " cmp "INC_X", #1 \n" + " cmp %[INCX_], #1 \n" " bne 5f //dot_kernel_S_BEGIN \n" - " cmp "INC_Y", #1 \n" + " cmp %[INCY_], #1 \n" " bne 5f //dot_kernel_S_BEGIN \n" "1: //dot_kernel_F_BEGIN: \n" - " lsl "INC_X", "INC_X", "INC_SHIFT" \n" - " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" - " asr "J", "N", #"N_DIV_SHIFT" \n" - " cmp "J", xzr \n" + " lsl %[INCX_], %[INCX_], "INC_SHIFT" \n" + " lsl %[INCY_], %[INCY_], "INC_SHIFT" \n" + " asr %[J_], %[N_], #"N_DIV_SHIFT" \n" + " cmp %[J_], xzr \n" " beq 3f //dot_kernel_F1 \n" " .align 5 \n" "2: //dot_kernel_F: \n" " "KERNEL_F" \n" - " subs "J", "J", #1 \n" + " subs %[J_], %[J_], #1 \n" " bne 2b //dot_kernel_F \n" " "KERNEL_F_FINALIZE" \n" "3: //dot_kernel_F1: \n" - " ands "J", "N", #"N_REM_MASK" \n" + " ands %[J_], %[N_], #"N_REM_MASK" \n" " ble 9f //dot_kernel_L999 \n" "4: //dot_kernel_F10: \n" " "KERNEL_F1" \n" - " subs "J", "J", #1 \n" + " subs %[J_], %[J_], #1 \n" " bne 4b //dot_kernel_F10 \n" " b 9f //dot_kernel_L999 \n" "5: //dot_kernel_S_BEGIN: \n" - " lsl "INC_X", "INC_X", "INC_SHIFT" \n" - " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" - " asr "J", "N", #2 \n" - " cmp "J", xzr \n" + " lsl %[INCX_], %[INCX_], "INC_SHIFT" \n" + " lsl %[INCY_], %[INCY_], "INC_SHIFT" \n" + " asr %[J_], %[N_], #2 \n" + " cmp %[J_], xzr \n" " ble 7f //dot_kernel_S1 \n" "6: //dot_kernel_S4: \n" @@ -333,32 +324,30 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B " "KERNEL_F1" \n" " "KERNEL_F1" \n" " "KERNEL_F1" \n" - " subs "J", "J", #1 \n" + " subs %[J_], %[J_], #1 \n" " bne 6b //dot_kernel_S4 \n" "7: //dot_kernel_S1: \n" - " ands "J", "N", #3 \n" + " ands %[J_], %[N_], #3 \n" " ble 9f //dot_kernel_L999 \n" "8: //dot_kernel_S10: \n" " "KERNEL_F1" \n" - " subs "J", "J", #1 \n" + " subs %[J_], %[J_], #1 \n" " bne 8b //dot_kernel_S10 \n" "9: //dot_kernel_L999: \n" - " str "DOTF", [%[DOT_]] \n" - - : - : [DOT_] "r" (&dot), //%0 - [N_] "r" (n), //%1 - [X_] "r" (x), //%2 - [INCX_] "r" (inc_x), //%3 - [Y_] "r" (y), //%4 - [INCY_] "r" (inc_y) //%5 + + : [DOT_] "=&w" (dot) + : [N_] "r" (n), + [X_] "r" (x), + [INCX_] "r" (inc_x), + [Y_] "r" (y), + [INCY_] "r" (inc_y), + [J_] "r" (j) : "cc", "memory", - "x0", "x1", "x2", "x3", "x4", "x5", - "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" + "d1", "d2", "d3", "d4", "d5", "d6", "d7" ); return dot; From bae45d94d13ee0e4b8e041b4a10d82198298409d Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Tue, 29 Nov 2022 08:02:45 -0500 Subject: [PATCH 151/154] scal benchmark: eliminate y, move init/timing out of loop Removing y avoids cache effects (if y is the 
size of the L1 cache, the main array x is removed from it). Moving init and timing out of the loop makes the scal benchmark behave like the gemm benchmark, and allows higher accuracy for smaller test cases since the loop overhead is much smaller than the timing overhead. Example: OPENBLAS_LOOPS=10000 ./dscal.goto 1024 8192 1024 on AMD Zen2 (7532) with 32k (4k doubles) L1 cache per core. Before From : 1024 To : 8192 Step = 1024 Inc_x = 1 Inc_y = 1 Loops = 10000 SIZE Flops 1024 : 5627.08 MFlops 0.000000 sec 2048 : 5907.34 MFlops 0.000000 sec 3072 : 5553.30 MFlops 0.000001 sec 4096 : 5446.38 MFlops 0.000001 sec 5120 : 5504.61 MFlops 0.000001 sec 6144 : 5501.80 MFlops 0.000001 sec 7168 : 5547.43 MFlops 0.000001 sec 8192 : 5548.46 MFlops 0.000001 sec After From : 1024 To : 8192 Step = 1024 Inc_x = 1 Inc_y = 1 Loops = 10000 SIZE Flops 1024 : 6310.28 MFlops 0.000000 sec 2048 : 6396.29 MFlops 0.000000 sec 3072 : 6439.14 MFlops 0.000000 sec 4096 : 6327.14 MFlops 0.000001 sec 5120 : 5628.24 MFlops 0.000001 sec 6144 : 5616.41 MFlops 0.000001 sec 7168 : 5553.13 MFlops 0.000001 sec 8192 : 5600.88 MFlops 0.000001 sec We can see the L1->L2 switchover point is now where it should be, and the number of flops for L1 is more accurate. --- benchmark/scal.c | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/benchmark/scal.c b/benchmark/scal.c index 8de6cfd04..79bcb6729 100644 --- a/benchmark/scal.c +++ b/benchmark/scal.c @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int main(int argc, char *argv[]){ - FLOAT *x, *y; + FLOAT *x; FLOAT alpha[2] = { 2.0, 2.0 }; blasint m, i; blasint inc_x=1,inc_y=1; @@ -74,10 +74,6 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - #ifdef __linux srandom(getpid()); #endif @@ -91,30 +87,20 @@ int main(int argc, char *argv[]){ fprintf(stderr, " %6d : ", (int)m); + for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + begin(); for (l=0; l Date: Fri, 25 Nov 2022 14:46:24 +0000 Subject: [PATCH 152/154] Set SWITCH_RATIO for Arm(R) Neoverse(TM) V1 CPUs From testing this yields better results than the default of `2`. --- param.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/param.h b/param.h index 514b13a3a..19cbe75a5 100644 --- a/param.h +++ b/param.h @@ -3367,6 +3367,8 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #elif defined(NEOVERSEV1) +#define SWITCH_RATIO 16 + #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 From fd4f52c797328b4b134f62b5e97bbbba3e79426a Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Thu, 17 Nov 2022 06:49:57 +0000 Subject: [PATCH 153/154] Add SVE implementation for sdot/ddot This adds an SVE implementation to sdot/ddot when available, falling back to the previous Advanced SIMD kernel where there's no SVE implementation for the kernel. All the targets were essentially treating `dot_thunderx2t99.c` as the Advanced SIMD implementation so I've renamed it to better fit with the feature detection. 
--- Makefile.arm64 | 10 +- getarch.c | 2 +- kernel/arm64/KERNEL.NEOVERSEN1 | 4 +- kernel/arm64/KERNEL.NEOVERSEN2 | 4 +- kernel/arm64/KERNEL.NEOVERSEV1 | 4 +- kernel/arm64/KERNEL.THUNDERX2T99 | 4 +- kernel/arm64/KERNEL.THUNDERX3T110 | 4 +- kernel/arm64/dot.c | 111 ++++++++++++++++++ ...{dot_thunderx2t99.c => dot_kernel_asimd.c} | 66 +---------- kernel/arm64/dot_kernel_sve.c | 66 +++++++++++ 10 files changed, 194 insertions(+), 81 deletions(-) create mode 100644 kernel/arm64/dot.c rename kernel/arm64/{dot_thunderx2t99.c => dot_kernel_asimd.c} (87%) create mode 100644 kernel/arm64/dot_kernel_sve.c diff --git a/Makefile.arm64 b/Makefile.arm64 index e2c471c2b..fc986f4c0 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -70,12 +70,12 @@ endif ifeq ($(CORE), NEOVERSEN1) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) ifeq ($(GCCVERSIONGTEQ9), 1) -CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 +CCOMMON_OPT += -march=armv8.2-a+sve -mtune=neoverse-n1 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 endif else -CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 endif @@ -94,12 +94,12 @@ ifeq ($(CORE), NEOVERSEV1) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) ifeq ($(GCCVERSIONGTEQ10), 1) ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11))) -CCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 +CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-v1 ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 endif else -CCOMMON_OPT += -march=armv8.4-a -mtune=native +CCOMMON_OPT += -march=armv8.4-a+sve -mtune=native ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.4-a -mtune=native endif @@ -133,7 +133,7 @@ ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 endif else -CCOMMON_OPT += -march=armv8.5-a -mtune=native +CCOMMON_OPT += -march=armv8.5-a+sve -mtune=native ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.5-a -mtune=native endif diff --git a/getarch.c b/getarch.c index cde5b4e83..f26ca6325 100644 --- a/getarch.c +++ b/getarch.c @@ -1410,7 +1410,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
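For orientation, here is a condensed sketch of the predicated SVE loop this change introduces, reduced to double precision, a single accumulator and unit strides (the only case the new kernel handles). The function name is illustrative only and is not part of the patch; the committed dot_kernel_sve.c in the diff below covers both precisions via macros and unrolls by two vector widths:

    #include <arm_sve.h>
    #include <stdint.h>

    /* Illustrative sketch: predicated SVE dot product, double precision,
     * contiguous x and y. Not part of the patch itself. */
    double ddot_sve_sketch(int64_t n, const double *x, const double *y)
    {
        svfloat64_t acc = svdup_f64(0.0);
        int64_t vl = (int64_t)svcntd();           /* elements per SVE vector */

        for (int64_t i = 0; i < n; i += vl) {
            svbool_t pg = svwhilelt_b64(i, n);    /* predicate masks the tail */
            svfloat64_t xv = svld1_f64(pg, &x[i]);
            svfloat64_t yv = svld1_f64(pg, &y[i]);
            acc = svmla_f64_m(pg, acc, xv, yv);   /* acc += x[i..] * y[i..]   */
        }
        return svaddv_f64(svptrue_b64(), acc);    /* horizontal reduction     */
    }

Because svwhilelt produces a partial predicate on the last iteration, no separate scalar remainder loop is needed, unlike the Advanced SIMD path.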
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ - "-march=armv8.4-a -mtune=neoverse-v1" + "-march=armv8.4-a+sve -mtune=neoverse-v1" #define LIBNAME "neoversev1" #define CORENAME "NEOVERSEV1" #endif diff --git a/kernel/arm64/KERNEL.NEOVERSEN1 b/kernel/arm64/KERNEL.NEOVERSEN1 index ea010db42..9a5938459 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN1 +++ b/kernel/arm64/KERNEL.NEOVERSEN1 @@ -96,8 +96,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c CNRM2KERNEL = scnrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c -DDOTKERNEL = dot_thunderx2t99.c -SDOTKERNEL = dot_thunderx2t99.c +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 index ae386d6e1..b743d1a43 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN2 +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -96,8 +96,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c CNRM2KERNEL = scnrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c -DDOTKERNEL = dot_thunderx2t99.c -SDOTKERNEL = dot_thunderx2t99.c +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 index ea010db42..9a5938459 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV1 +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -96,8 +96,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c CNRM2KERNEL = scnrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c -DDOTKERNEL = dot_thunderx2t99.c -SDOTKERNEL = dot_thunderx2t99.c +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S diff --git a/kernel/arm64/KERNEL.THUNDERX2T99 b/kernel/arm64/KERNEL.THUNDERX2T99 index a20d0d4a6..41cedc851 100644 --- a/kernel/arm64/KERNEL.THUNDERX2T99 +++ b/kernel/arm64/KERNEL.THUNDERX2T99 @@ -161,8 +161,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c -DDOTKERNEL = dot_thunderx2t99.c -SDOTKERNEL = dot_thunderx2t99.c +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S diff --git a/kernel/arm64/KERNEL.THUNDERX3T110 b/kernel/arm64/KERNEL.THUNDERX3T110 index a20d0d4a6..41cedc851 100644 --- a/kernel/arm64/KERNEL.THUNDERX3T110 +++ b/kernel/arm64/KERNEL.THUNDERX3T110 @@ -161,8 +161,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c -DDOTKERNEL = dot_thunderx2t99.c -SDOTKERNEL = dot_thunderx2t99.c +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c CDOTKERNEL = zdot_thunderx2t99.c ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S diff --git a/kernel/arm64/dot.c b/kernel/arm64/dot.c new file mode 100644 index 000000000..094bce696 --- /dev/null +++ b/kernel/arm64/dot.c @@ -0,0 +1,111 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2022, Arm Ltd +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#ifdef HAVE_SVE +#include "dot_kernel_sve.c" +#endif +#include "dot_kernel_asimd.c" + +#if defined(SMP) +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, + BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, + void *c, BLASLONG ldc, int (*function)(), int nthreads); +#endif + +static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + RETURN_TYPE dot = 0.0 ; + + if ( n <= 0 ) return dot; + +#ifdef HAVE_SVE + if (inc_x == 1 && inc_y == 1) { + return dot_kernel_sve(n, x, y); + } +#endif + + return dot_kernel_asimd(n, x, inc_x, y, inc_y); +} + +#if defined(SMP) +static int dot_thread_function(BLASLONG n, BLASLONG dummy0, + BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) +{ + *(RETURN_TYPE *)result = dot_compute(n, x, inc_x, y, inc_y); + + return 0; +} +#endif + +RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; +#endif + RETURN_TYPE dot = 0.0; + +#if defined(SMP) + if (inc_x == 0 || inc_y == 0 || n <= 10000) + nthreads = 1; + else + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { + dot = dot_compute(n, x, inc_x, y, inc_y); + } else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) * 2]; + RETURN_TYPE *ptr; + +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_REAL; +#else + mode = BLAS_DOUBLE | BLAS_REAL; +#endif + + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, + x, inc_x, y, inc_y, result, 0, + ( void *)dot_thread_function, nthreads); + + ptr = (RETURN_TYPE *)result; + for (i = 0; i < nthreads; i++) { + dot = dot + (*ptr); + ptr = (RETURN_TYPE *)(((char *)ptr) + sizeof(double) * 2); + } + } +#else + dot = dot_compute(n, x, inc_x, y, inc_y); +#endif + + return dot; +} diff --git a/kernel/arm64/dot_thunderx2t99.c b/kernel/arm64/dot_kernel_asimd.c similarity index 87% rename from kernel/arm64/dot_thunderx2t99.c rename to kernel/arm64/dot_kernel_asimd.c index 9131f1e86..1288838f8 100644 --- a/kernel/arm64/dot_thunderx2t99.c +++ b/kernel/arm64/dot_kernel_asimd.c @@ -260,18 +260,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
" faddp "OUT", v0.2d \n" #endif /* !defined(DOUBLE) */ -#if defined(SMP) -extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, - BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, - void *c, BLASLONG ldc, int (*function)(), int nthreads); -#endif - -static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +static RETURN_TYPE dot_kernel_asimd(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { RETURN_TYPE dot = 0.0; - - if ( n < 0 ) return dot; - BLASLONG j = 0; __asm__ __volatile__ ( @@ -352,58 +343,3 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B return dot; } - -#if defined(SMP) -static int dot_thread_function(BLASLONG n, BLASLONG dummy0, - BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, - BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) -{ - *(RETURN_TYPE *)result = dot_compute(n, x, inc_x, y, inc_y); - - return 0; -} -#endif - -RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ -#if defined(SMP) - int nthreads; - FLOAT dummy_alpha; -#endif - RETURN_TYPE dot = 0.0; - -#if defined(SMP) - if (inc_x == 0 || inc_y == 0 || n <= 10000) - nthreads = 1; - else - nthreads = num_cpu_avail(1); - - if (nthreads == 1) { - dot = dot_compute(n, x, inc_x, y, inc_y); - } else { - int mode, i; - char result[MAX_CPU_NUMBER * sizeof(double) * 2]; - RETURN_TYPE *ptr; - -#if !defined(DOUBLE) - mode = BLAS_SINGLE | BLAS_REAL; -#else - mode = BLAS_DOUBLE | BLAS_REAL; -#endif - - blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, - x, inc_x, y, inc_y, result, 0, - ( void *)dot_thread_function, nthreads); - - ptr = (RETURN_TYPE *)result; - for (i = 0; i < nthreads; i++) { - dot = dot + (*ptr); - ptr = (RETURN_TYPE *)(((char *)ptr) + sizeof(double) * 2); - } - } -#else - dot = dot_compute(n, x, inc_x, y, inc_y); -#endif - - return dot; -} diff --git a/kernel/arm64/dot_kernel_sve.c b/kernel/arm64/dot_kernel_sve.c new file mode 100644 index 000000000..8460e0d5e --- /dev/null +++ b/kernel/arm64/dot_kernel_sve.c @@ -0,0 +1,66 @@ +/*************************************************************************** +Copyright (c) 2022, Arm Ltd +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#include <arm_sve.h> + +#ifdef DOUBLE +#define SVE_TYPE svfloat64_t +#define SVE_ZERO svdup_f64(0.0) +#define SVE_WHILELT svwhilelt_b64 +#define SVE_ALL svptrue_b64() +#define SVE_WIDTH svcntd() +#else +#define SVE_TYPE svfloat32_t +#define SVE_ZERO svdup_f32(0.0) +#define SVE_WHILELT svwhilelt_b32 +#define SVE_ALL svptrue_b32() +#define SVE_WIDTH svcntw() +#endif + +static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) { + SVE_TYPE acc_a = SVE_ZERO; + SVE_TYPE acc_b = SVE_ZERO; + + BLASLONG sve_width = SVE_WIDTH; + + for (BLASLONG i = 0; i < n; i += sve_width * 2) { + svbool_t pg_a = SVE_WHILELT(i, n); + svbool_t pg_b = SVE_WHILELT(i + sve_width, n); + + SVE_TYPE x_vec_a = svld1(pg_a, &x[i]); + SVE_TYPE y_vec_a = svld1(pg_a, &y[i]); + SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]); + SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]); + + acc_a = svmla_m(pg_a, acc_a, x_vec_a, y_vec_a); + acc_b = svmla_m(pg_b, acc_b, x_vec_b, y_vec_b); + } + + return svaddv(SVE_ALL, acc_a) + svaddv(SVE_ALL, acc_b); +} From eea006a6886856bc1b89817052a8c471234d9c6b Mon Sep 17 00:00:00 2001 From: Chris Sidebottom Date: Tue, 29 Nov 2022 17:53:38 +0000 Subject: [PATCH 154/154] Wrap SVE header with __has_include check --- kernel/arm64/dot.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/arm64/dot.c b/kernel/arm64/dot.c index 094bce696..4607ebc59 100644 --- a/kernel/arm64/dot.c +++ b/kernel/arm64/dot.c @@ -29,7 +29,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" +// Some compilers will report feature support for SVE without the appropriate +// header available #ifdef HAVE_SVE +#if defined __has_include +#if __has_include(<arm_sve.h>) && __ARM_FEATURE_SVE +#define USE_SVE +#endif +#endif +#endif + +#ifdef USE_SVE #include "dot_kernel_sve.c" #endif #include "dot_kernel_asimd.c" @@ -46,7 +56,7 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B if ( n <= 0 ) return dot; -#ifdef HAVE_SVE +#ifdef USE_SVE if (inc_x == 1 && inc_y == 1) { return dot_kernel_sve(n, x, y); }
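As a quick sanity check of the guard added in this last patch, the same preprocessor logic can be exercised in a tiny standalone program. This is not part of the patch; the file name and messages below are made up for illustration, and HAVE_SVE stands in for the define the OpenBLAS build normally passes via getarch:

    /* sve_probe.c -- hypothetical helper mirroring the USE_SVE guard in dot.c */
    #include <stdio.h>

    #ifdef HAVE_SVE                /* normally set by the OpenBLAS build */
    #if defined __has_include
    #if __has_include(<arm_sve.h>) && __ARM_FEATURE_SVE
    #define USE_SVE
    #endif
    #endif
    #endif

    int main(void) {
    #ifdef USE_SVE
        puts("arm_sve.h present and SVE enabled: the SVE dot kernel would be built");
    #else
        puts("no usable SVE support: only the Advanced SIMD dot kernel would be built");
    #endif
        return 0;
    }

Building this with -DHAVE_SVE, with and without SVE code generation enabled (for example -march=armv8.2-a+sve), shows which path dot.c would take on a given toolchain, including compilers that report SVE support but do not ship the arm_sve.h header.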